def nnps_and_keywords(text):
    """Map groups of proper nouns to the keywords found alongside them.

    Every sentence of the parse tree built from ``text`` is wrapped in a
    ``Document``.  For each keyword tagged ``NNP`` a name is built by joining
    the ``NNP`` words of the keyword's phrase with ``-``.  From the top five
    keywords, every non-``NNP`` one contributes its lemma (or its raw string
    when it has no lemma).  Only sentences yielding more than one name and
    more than one plain keyword are recorded.

    Args:
        text: text handed to ``parsetree``.

    Returns:
        dict mapping a sorted tuple of names to the set of keywords collected
        for that group of names.
    """
    nnp_kw = {}
    for sentence in parsetree(text, relations=True, lemmata=True):
        doc = Document(sentence)

        names = set()
        for kw in doc.keywords():
            word = kw[1]
            if word.type == 'NNP':
                names.add("-".join(
                    [wd.string for wd in word.phrase.words if wd.type == 'NNP']))

        words = set()
        for kw in doc.keywords(top=5):
            word = kw[1]
            if word.type != 'NNP':
                words.add(word.lemma or word.string)

        if len(names) > 1 and len(words) > 1:
            # Iteration order of a set is not canonical, so tuple(names) could
            # differ between two equal sets; sorting makes equal groups share
            # one key and therefore merge their keywords.
            key = tuple(sorted(names))
            nnp_kw.setdefault(key, set()).update(words)
    return nnp_kw
def get_keywords_article(article):
    """Store the top five noun keywords on ``article.keywords``.

    Words from ``article.tagged_content`` whose ``Tag`` starts with ``NN`` are
    handed to a ``Document``; its ``keywords(top=5)`` result is assigned to
    ``article.keywords``.  Nothing is returned.
    """
    nouns = []
    for token in article.tagged_content:
        if token.Tag.startswith('NN'):
            nouns.append(token.Word)
    article.keywords = Document(nouns).keywords(top=5)
def run(self,minePackage):
    """Assign a ``weight_WA`` score to every node of every cloud.

    The keywords of each node's content (Porter-stemmed, top 500, normalized
    weights) are added to three running totals:

    * ``ac`` -- keyword is in the query and in the dictionary,
    * ``ap`` -- keyword is in the dictionary but not in the query,
    * ``an`` -- keyword is in the query but not in the dictionary.

    The node weight is the alpha/beta/gamma-weighted mean of the totals, or 0
    while all totals are zero.

    Args:
        minePackage: mapping providing ``'clouds'`` (objects exposing a
            ``graph``) and ``'searchKeyStemmer'`` (container the keywords are
            looked up in).
    """
    ac = 0.0  # key hits
    ap = 0.0  # positive hits
    an = 0.0  # negative hits
    alpha = 1.00
    beta = 0.75
    gamma = 0.25

    # os.path.join keeps the path relative when dirname() is empty (the
    # previous string concatenation pointed at "/dictionary.txt" in that
    # case); the with-block closes the handle that used to be leaked.
    dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary.txt")
    with open(dictionary_path, 'r') as handle:
        dictionary = Document(handle.read(), stemmer=PORTER)
    vocabulary = dictionary.words

    clouds = minePackage['clouds']
    query = minePackage['searchKeyStemmer']
    for cloud in clouds:
        for n in cloud.graph.nodes():
            methodData = cloud.graph.node[n]['methodData']
            content = Document(methodData.getContent(), stemmer=PORTER)
            for weight, word in content.keywords(top=500, normalized=True):
                in_query = word in query
                in_dictionary = word in vocabulary
                if in_query and in_dictionary:
                    ac += weight
                elif in_dictionary:
                    ap += weight
                elif in_query:
                    an += weight
            # NOTE(review): the totals are never reset, so each node's weight
            # also reflects all previously visited nodes -- confirm intended.
            total = ac + ap + an
            if total > 0:
                cloud.graph.node[n]['weight_WA'] = (
                    (ac * alpha) + (ap * beta) + (an * gamma)) / total
            else:
                cloud.graph.node[n]['weight_WA'] = 0
def confusion_matrix(self, key=None, output_format=None, split=False):
    """Return a confusion matrix obtained from a random train/test split.

    Each classified entry is sent, with probability one half, either to
    train a fresh learner or to the testing set; the learner's own
    ``confusion_matrix`` is then computed on the testing set.  Because the
    split uses ``random.random()``, results vary between calls.

    Args:
        key: passed through to ``dependent_in_use`` and
            ``classified_entries``.
        output_format: ``'html'`` returns an HTML table string, ``'json'``
            a ``json.dumps`` string, ``'yaml'`` a ``yaml.safe_dump`` string;
            any other value (including ``None``) returns the raw result.
        split: when true, one matrix is built per value returned by
            ``dependent_in_use`` with labels ``'True'``/``'False'``
            (entry matches that value or not); otherwise a single matrix
            over the entries' own dependent values is built.

    Returns:
        The formatted output described above; the raw result is a dict of
        matrices keyed by dependent value when ``split`` is true, otherwise
        the single matrix.
    """
    if split:
        list_of_dependent = self.dependent_in_use(key=key)
    else:
        # A single pass; None is the placeholder key for the only matrix.
        list_of_dependent = [None]
    output = ''
    matrices = dict()
    for current_dep in list_of_dependent:
        testing_set = list()
        model = self._learner()
        for record in self.classified_entries(key=key):
            if split:
                # One-vs-rest label: 'True' or 'False'.
                dep_result = str(record.dependent == current_dep)
            else:
                dep_result = record.dependent
            # Coin flip decides whether the record trains or tests.
            if random.random() < 0.5:
                model.train(Document(record.independent.lower(), stemmer=PORTER), dep_result)
            else:
                testing_set.append((Document(record.independent.lower(), stemmer=PORTER), dep_result))
        matrix = model.confusion_matrix(documents=testing_set)
        matrices[current_dep] = matrix
        if output_format == 'html':
            # NOTE(review): labels are concatenated into the markup without
            # HTML escaping -- confirm they can never contain markup.
            if split:
                output += '<h4>' + current_dep + "</h4>"
            vals = matrix.keys()
            output += '<table class="table table-bordered"><thead><tr><td></td><td></td><td style="text-align: center" colspan="' + str(len(vals)) + '">Actual</td></tr><tr><th></th><th></th>'
            # ``first`` ensures the row-spanning "Predicted" cell is emitted once.
            first = True
            for val in vals:
                output += '<th>' + val + '</th>'
            output += '</tr></thead><tbody>'
            for val_a in vals:
                output += '<tr>'
                if first:
                    output += '<td style="text-align: right; vertical-align: middle;" rowspan="' + str(len(vals)) + '">Predicted</td>'
                    first = False
                output += '<th>' + val_a + '</th>'
                for val_b in vals:
                    # Column = actual value, row = predicted value.
                    output += '<td>' + str(matrix[val_b].get(val_a, 0)) + '</td>'
                output += '</tr>'
            output += '</tbody></table>'
            #output += "\n\n`" + str(matrix) + "`"
            # output += '<ul>'
            # for document, actual in testing_set:
            #     predicted = model.classify(document)
            #     output += '<li>Predicted: ' + predicted + '; Actual: ' + actual + '</li>'
            # output += '</ul>'
    if output_format == 'html':
        return output
    if split:
        ret_val = matrices
    else:
        ret_val = matrices[None]
    if output_format == 'json':
        return json.dumps(ret_val, sort_keys=True, indent=4)
    if output_format == 'yaml':
        return yaml.safe_dump(ret_val, default_flow_style=False)
    if output_format is None:
        return ret_val
    # Unrecognized formats fall back to the raw result as well.
    return ret_val
def setup():
    """Rebuild the module-level page tables and train the KNN classifier.

    Resets the globals ``pages``, ``urlalias``, ``revurlalias`` and ``knn``,
    fills them from a MySQL database (URL aliases, taxonomy terms, nodes)
    and then trains ``knn`` further on query/target pairs read through the
    module-level ``conn`` connection, which is not created here.
    """
    global pages
    global urlalias
    global revurlalias
    global knn
    pages = dict()
    urlalias = dict()
    revurlalias = dict()
    knn = KNN()
    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from configuration.
    db = MySQLdb.connect(host="192.168.200.26", user="******", passwd="xxxsecretxxx", db="pla")
    cur = db.cursor()
    # Alias lookup in both directions: alias -> source and source -> alias.
    cur.execute("select source, alias from url_alias")
    for row in cur.fetchall():
        urlalias[row[1]] = row[0]
        revurlalias[row[0]] = row[1]
    # Taxonomy terms: register the page title under its URL (and alias).
    cur.execute("select tid, name, description, vid from taxonomy_term_data;")
    for row in cur.fetchall():
        url = 'taxonomy/term/' + str(row[0])
        pages[url] = row[1]
        if url in revurlalias:
            pages[revurlalias[url]] = row[1]
            url = revurlalias[url]
        # Only terms whose vid column equals 3 are used as training data.
        if row[3] == 3:
            soup = bs4.BeautifulSoup(row[2])
            # Collapse line breaks so the description is one lower-case line.
            the_text = re.sub(r'[\n\r]+', r' ', soup.get_text(' ')).lower()
            knn.train(Document(the_text, stemmer=PORTER), url)
            knn.train(Document(row[1].lower()), url)
    # Node bodies and titles linked to a taxonomy term train that term's URL.
    cur.execute(
        "select a.tid, c.body_value, d.title from taxonomy_term_data as a inner join field_data_field_practice_areas as b on (a.tid=b.field_practice_areas_tid and b.entity_type='node' and b.bundle != 'professionals' and b.deleted=0) inner join field_data_body as c on (b.entity_id=c.entity_id and b.entity_type=c.entity_type) inner join node as d on (c.entity_id=d.nid);"
    )
    for row in cur.fetchall():
        url = 'taxonomy/term/' + str(row[0])
        if url in revurlalias:
            url = revurlalias[url]
        soup = bs4.BeautifulSoup(row[1])
        the_text = re.sub(r'[\n\r]+', r' ', soup.get_text(' ')).lower()
        knn.train(Document(the_text, stemmer=PORTER), url)
        knn.train(Document(row[2].lower()), url)
    # Nodes with status 1: register titles only, no training.
    cur.execute("select nid, title from node where status=1;")
    for row in cur.fetchall():
        url = 'node/' + str(row[0])
        pages[url] = row[1]
        if url in revurlalias:
            pages[revurlalias[url]] = row[1]
    db.close()
    # ``conn`` is a module-level connection defined outside this function.
    pgcur = conn.cursor()
    pgcur.execute(
        "select query, target from website_queries where target is not null group by query, target"
    )
    for row in pgcur.fetchall():
        # A target may list several entries separated by newlines, commas
        # or semicolons; each one becomes a training label for the query.
        words = re.split(r'[\n\r,;]+ *', row[1])
        for word in words:
            print("training on " + row[0].lower() + " for " + word)
            knn.train(Document(row[0].lower()), word)
    conn.commit()
    pgcur.close()
def load_text( self, text ):
    """Record ``text`` on the wrapper in several ``Document`` forms.

    Sets the start time, a raw document (threshold 0) with its count, a
    Porter-stemmed and an unstemmed document (both threshold 1), the
    original text and the MD5 hex digest of its UTF-8 encoding.
    """
    self.time_start = datetime.datetime.now()

    raw_document = Document( text, threshold=0 )
    self.document_raw = raw_document
    self.document_raw_count = raw_document.count

    self.document_thresh_stemmed = Document( text, stemmer=PORTER, threshold=1 )
    self.document_thresh_unstemmed = Document( text, threshold=1 )

    self.original_text = text
    # unicode text -> byte string -> hex digest -> unicode digest
    encoded_text = self.original_text.encode(u'utf-8', u'replace')
    hex_digest = hashlib.md5(encoded_text).hexdigest()
    self.original_text_md5_hash = hex_digest.decode(u'utf-8', u'replace')
def insertarDocumento(self, url, contenido):
    """Create a MongoDB record and a saved Pattern ``Document`` for a page.

    Args:
        url: used as the document name.
        contenido: content the ``Document`` is built from.

    Returns:
        The ``Document`` that was built.  It is saved under
        ``DocumentoPattern/<inserted_id>`` only when
        ``self.mongodb.crearDocumento`` returns a truthy result.
    """
    # ``stemmer`` is the keyword used for stemming elsewhere in this file
    # (see crearDocumentoPattern); the previous ``stemming=`` spelling is not
    # a documented Document option, so no stemmer was being requested.
    # NOTE(review): ``weigth`` is kept as originally spelled -- confirm that
    # Document honours it at all.
    unDocumento = Document(contenido, name=url, stopwords=True,
                           stemmer=PORTER, weigth=TFIDF)
    result = self.mongodb.crearDocumento(unDocumento)
    if result:
        unDocumento.save("DocumentoPattern/" + str(result.inserted_id))
    return unDocumento
def resolve_certainty(certainty_info):
    '''Classify ``certainty_info`` with a freshly trained Naive Bayes model.

    Returns the fixed message 'No certainty info.' for an empty string;
    otherwise trains on every (observation, certainty) row of the training
    CSV and returns the classifier's verdict for ``certainty_info``.
    '''
    if certainty_info == '':
        return 'No certainty info.'

    classifier = NB()
    training_rows = csv('library/templatetags/c_training_data.csv')
    for observation, certainty in training_rows:
        example = Document(observation, type=int(certainty), stopwords=True)
        classifier.train(example)
    return classifier.classify(Document(certainty_info))
def evaluate_query(query):
    """Return the URLs predicted for ``query``, most probable first.

    The module-level ``knn`` classifier supplies per-label probabilities; if
    it supplies none, its single discrete answer is used with probability
    1.0.  Labels are passed through ``fixurl`` (sharing one ``seen`` set) in
    descending probability order and ``None`` results are dropped.
    """
    probs = dict(knn.classify(Document(query), discrete=False))
    if not len(probs):
        probs[knn.classify(Document(query))] = 1.0

    seen = set()
    ranked_labels = sorted(probs, key=probs.get, reverse=True)
    # Resolve every label first so ``seen`` is updated in ranking order.
    resolved = [fixurl(label, seen) for label in ranked_labels]
    return [url for url in resolved if url is not None]
def crearDocumentoPattern(self, contenido, name=""):
    '''Build a Pattern Document from ``contenido``.

    The document is requested with stop-word removal, the Porter stemmer and
    a ``weigth=TFIDF`` keyword (spelled exactly as passed here).
    '''
    opciones = dict(name=name, stemmer=PORTER, stopwords=True, weigth=TFIDF)
    return Document(contenido, **opciones)
def summarize(text, n=1):
    """Return the ``n`` best-ranked sentences of ``text`` as one string.

    Sentences are ranked by ``utils.textrank`` over a TF-IDF model with one
    document per sentence; the winners are concatenated (no separator) in
    their original order.

    - text: string consisting of a few sentences
    - n: number of sentences to extract
    """
    sentences = tokenize(text)
    # One document per sentence, named after the sentence position.
    docs = [Document(sentence, name=position)
            for position, sentence in enumerate(sentences)]
    model = Model(docs, weight=TFIDF)
    ranking = utils.textrank(model.documents, model.distance)
    # Split the (index, score) pairs into indexes and scores.
    chosen, _scores = zip(*ranking.most_common(n))
    return ''.join([sentences[position] for position in sorted(chosen)])
def articles_to_trends(articles):
    """Print the top ten keywords for each day of ``articles``.

    Stories without a truthy ``'added_at'`` are skipped.  The remaining
    stories are grouped by the date returned by ``timestamptext``; within a
    day, texts are keyed by their hash so identical texts are stored once.
    Each day then becomes one lower-cased document in a shared model.
    """
    news = {}
    for story in articles:
        if not story['added_at']:
            continue
        article_text = get_article_text(story['url'])
        day, content = timestamptext(story['added_at'], article_text)
        news.setdefault(day, {})[hash(content)] = content

    model = Model()
    for date, stories in news.items():
        # All of a day's stories form a single document, so that the model
        # can weigh terms across days.
        day_text = ' '.join(stories.values()).lower()
        model.append(Document(day_text, stemmer=LEMMA,
                              exclude=['news', 'day'], name=date))

    for document in model:
        print(document.name)
        print(document.keywords(top=10))
def _train(self, indep, depend): """Trains the machine learner given an independent variable and a corresponding dependent variable.""" if indep is None: return the_text = re.sub(r'[\n\r]+', r' ', indep).lower() learners[self.group_id].train( Document(the_text.lower(), stemmer=PORTER), depend)
def summarize(text_to_summarize):
    """Return the two best-ranked sentences of the text, in original order.

    Steps:
      1. Sentences with more than seven space-separated tokens become
         lemma-stemmed ``Document`` objects named after their position.
      2. Every ordered pair of distinct sentences is compared by the Jaccard
         distance of their keyword lists.
      3. Pairs with distance below 1 become weighted edges (weight is
         1 - distance) of a directed graph.
      4. PageRank ranks the sentences; the top two are joined with a space.
    """
    stokens = tokenize(text_to_summarize)
    docs = [Document(string=s, name=e, stemmer=LEMMA)
            for e, s in enumerate(stokens) if len(s.split(" ")) > 7]

    # Keyword lists are extracted once per sentence instead of once per
    # pair, which previously repeated the work inside the quadratic loop.
    keyword_lists = [(doc.name, [kw[1] for kw in doc.keywords()])
                     for doc in docs]

    linkgraph = []
    for name_a, wordset_a in keyword_lists:
        for name_b, wordset_b in keyword_lists:
            if name_a == name_b:
                continue
            jacc_dist = distance.jaccard(wordset_a, wordset_b)
            if jacc_dist < 1:
                # (sentence index, sentence index, similarity score)
                linkgraph.append((str(name_a), str(name_b), 1 - jacc_dist))

    # Weighted PageRank, see
    # http://stackoverflow.com/questions/9136539/how-to-weighted-edges-affect-pagerank-in-networkx
    D = nx.DiGraph()
    D.add_weighted_edges_from(linkgraph)
    pagerank = nx.pagerank(D)
    # Ascending sort followed by reverse() keeps the original tie order.
    sort_pagerank = sorted(pagerank.items(), key=operator.itemgetter(1))
    sort_pagerank.reverse()
    orderedtop2 = sorted(int(x[0]) for x in sort_pagerank[:2])
    return " ".join([stokens[i] for i in orderedtop2])
def feeds_to_trends(feeds):
    """Print the top ten daily keywords for every feed in ``feeds``.

    For each entry, ``entry['feed_url']`` is fetched uncached through
    ``Newsfeed().search``.  Stories are grouped by the date returned by
    ``datetext`` and keyed by the hash of their text, so identical texts are
    stored once per day.  Each day becomes one lower-cased document in a
    model built for that feed.  When the fetch raises ``HTTP404NotFound``
    the feed URL is printed instead.
    """
    for feed in feeds:
        url = feed['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                day, content = datetext(story.date, story.description)
                news.setdefault(day, {})[hash(content)] = content

            model = Model()
            for date, stories in news.items():
                # One document per day lets the model weigh terms across days.
                day_text = ' '.join(stories.values()).lower()
                model.append(Document(day_text, stemmer=LEMMA,
                                      exclude=['news', 'day'], name=date))

            for document in model:
                print(document.name)
                print(document.keywords(top=10))
        except HTTP404NotFound:
            print(url)
def doclist_from_feeds(feeds):
    """Return one lemma-stemmed ``Document`` per key of ``gettitles(feeds)``.

    The titles stored under each key are joined with spaces to form the
    document text; the document threshold is 0.
    """
    titles = gettitles(feeds)
    return [
        Document(" ".join(titles[key]), stemmer=LEMMA, threshold=0)
        for key in titles
    ]
def predict(self, indep, probabilities=False):
    """Return predicted dependent values for ``indep``, best first.

    Line breaks in ``indep`` are collapsed and the text lower-cased before
    anything else.  An empty list is returned when ``_train_from_db()`` is
    falsy.  If the learner yields no probabilities, its single discrete
    answer (when not None) is used with probability 1.0.

    With ``probabilities`` true the result is a list of (value, probability)
    pairs; otherwise a list of values.
    """
    indep = re.sub(r'[\n\r]+', r' ', indep).lower()
    if not self._train_from_db():
        return list()

    learner = learners[self.group_id]
    probs = dict(learner.classify(Document(indep, stemmer=PORTER), discrete=False))
    if not len(probs):
        single_result = learner.classify(Document(indep, stemmer=PORTER))
        if single_result is not None:
            probs[single_result] = 1.0

    ranked = sorted(probs.keys(), key=probs.get, reverse=True)
    if not probabilities:
        return ranked
    return [(value, probs[value]) for value in ranked]
def word_ranking(text, n='L2'):
    """Score the terms of ``text`` with an LSA "cross-method" style ranking.

    steps:
        1. tokenize text by sentences, one document per sentence
        2. build a TF-IDF model of those documents
        3. reduce the model with LSA (``n`` is passed as ``dimensions``)
        4. in the concept-by-term matrix, zero every cell not above its
           row average, scale rows by the singular values and sum per term
    (sources: http://www.aclweb.org/anthology/C10-1098.pdf,
     http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf)

    - text: string consisting of a few sentences
    - n: value handed to ``Model.reduce(dimensions=...)``

    Returns a ``Counter`` mapping each LSA term to its score.
    """
    sentences = tokenize(text)
    docs = [Document(sentence, name=position)
            for position, sentence in enumerate(sentences)]
    model = Model(docs, weight=TFIDF)
    model.reduce(dimensions=n)
    lsa = model.lsa

    # rows: concepts/topics, columns: terms
    concept_term = np.array(lsa.vt)
    row_average = concept_term.mean(axis=1).reshape((-1, 1))
    # Keep only cells strictly above their concept's average.
    concept_term[concept_term <= row_average] = 0.0

    sigma = np.array(lsa.sigma).reshape((-1, 1))
    scores = (concept_term * sigma).sum(axis=0)
    return Counter(dict(zip(lsa.terms, scores)))
def build_model(results=None):
    """Build a binary sentence-by-feature matrix for the given results.

    Every result (a mapping with ``'text'``, ``'url'`` and ``'index'``)
    becomes a lemma-stemmed ``Document`` in a TF-IDF ``Model``.  Each
    sentence of each result's lower-cased text with at least five distinct
    non-stop words contributes one matrix row holding 1 for every model
    feature present in the sentence and 0 otherwise.

    Args:
        results: iterable of result mappings; ``None`` means no results.

    Returns:
        tuple ``(model, m, model_sentences, sentence_dict)`` where ``model``
        is a float array of shape (sentences, features), ``m`` the Pattern
        model, ``model_sentences`` the kept sentences in row order, and
        ``sentence_dict`` maps the 1-based row number of each kept sentence
        to the name of its source document (same keys as before).
    """
    # None replaces the former mutable ``[]`` default.
    results = [] if results is None else results
    documents = [
        Document(i.get('text'), name=i.get('url'),
                 description=i.get('index'), stemmer=LEMMA)
        for i in results
    ]
    m = Model(documents, weight=TFIDF)
    features = m.features

    rows = []
    sentence_dict = {}
    model_sentences = []
    for document, result in zip(documents, results):
        for sentence in sent_tokenize(result.get('text').lower()):
            s_words = {
                w for w in words(sentence, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            }
            if len(s_words) < 5:
                continue
            model_sentences.append(sentence)
            rows.append([1 if w in s_words else 0 for w in features])
            sentence_dict[len(rows)] = document.name

    # Rows are collected in a list and converted once; appending to the
    # array inside the loop copied the whole matrix for every sentence.
    model = np.array(rows, dtype=float).reshape(len(rows), len(features))
    return model, m, model_sentences, sentence_dict
def get_labeled_feats(self, data):
    """Return one labelled ``Document`` per (word, tag) pair in ``data``.

    Each document is built from the binary features that ``FeatExtract``
    produces for the word (with ``ArtOrDet`` set when this instance's
    ``error_tag`` is ``'ArtOrDet'``) and is typed with the pair's tag.
    """
    return [
        Document(
            FeatExtract(
                word, ArtOrDet=(self.error_tag == 'ArtOrDet')).binary_features(),
            type=tag,
            stopwords=True)
        for (word, tag) in data
    ]
def getMod():
    """Return the 5-dimension LSA reduction of the essays corpus.

    Every text file found under ``essays/original/`` is parsed with
    ``PageParser``, stripped of the cached stop words and turned into a
    ``Document`` (top 40 words) named after its file; the documents form a
    ``Model`` whose ``reduce(5)`` result is returned.

    The concept/category dump to ``lsa.txt`` that used to follow the
    ``return`` statement could never execute and has been removed.
    """
    essay_path = 'essays/original/'
    docs = []
    for filename in fio.recGetTextFiles(path.abspath(essay_path)):
        with io.open(filename, 'r', encoding='utf-8') as handle:
            blob = TextBlob(PageParser.parse(handle.read()))
        text = ' '.join(
            [word for word in blob.words if word not in cachedStopWords]
        ).lstrip()
        docs.append(Document(text, name=filename, top=40))
    return Model(docs).reduce(5)
def asDocumentClass(data, classification):
    ''' Convert a list of reviews into Pattern Documents that all carry
    the string form of ``classification`` as their type. '''
    label = str(classification)
    return [
        Document(entry['review/text'], type=label, stopwords=True)
        for entry in data
    ]
def asDocumentReview(data):
    ''' Convert a list of reviews into Pattern Documents typed with each
    review's score converted to float. '''
    scored = [(entry['review/text'], float(entry['review/score']))
              for entry in data]
    documents = []
    for text, score in scored:
        documents.append(Document(text, type=score, stopwords=True))
    return documents
def run(self, minePackage):
    """Assign a ``weight_WA`` score to every node of every cloud.

    The keywords of each node's content (Porter-stemmed, top 200, normalized
    weights) are added to three running totals:

    * ``ac`` -- keyword is in the query,
    * ``ap`` -- keyword is not in the query but is in the dictionary,
    * ``an`` -- keyword is in neither.

    The node weight is the alpha/beta/gamma-weighted mean of the totals, or 0
    while all totals are zero.

    Args:
        minePackage: mapping providing ``'clouds'`` (objects exposing a
            ``graph``) and ``'searchKeyStemmer'`` (container the keywords are
            looked up in).
    """
    ac = 0.0  # key hits
    ap = 0.0  # positive hits
    an = 0.0  # negative hits
    alpha = 1.00
    beta = 0.75
    gamma = 0.25

    # os.path.join keeps the path relative when dirname() is empty (the
    # previous string concatenation pointed at "/dictionary.txt" in that
    # case); the with-block closes the handle that used to be leaked.
    dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary.txt")
    with open(dictionary_path, 'r') as handle:
        dictionary = Document(handle.read(), stemmer=PORTER)
    vocabulary = dictionary.words

    clouds = minePackage['clouds']
    query = minePackage['searchKeyStemmer']
    for cloud in clouds:
        for n in cloud.graph.nodes():
            methodData = cloud.graph.node[n]['methodData']
            content = Document(methodData.getContent(), stemmer=PORTER)
            for weight, word in content.keywords(top=200, normalized=True):
                if word in query:
                    ac += weight
                elif word in vocabulary:
                    ap += weight
                else:
                    an += weight
            # NOTE(review): the totals are never reset, so each node's weight
            # also reflects all previously visited nodes -- confirm intended.
            total = ac + ap + an
            if total > 0:
                cloud.graph.node[n]['weight_WA'] = (
                    (ac * alpha) + (ap * beta) + (an * gamma)) / total
            else:
                cloud.graph.node[n]['weight_WA'] = 0
def calculate(self, minePackage):
    """Store a vector-space-model similarity on every node of every cloud.

    A TF-IDF model is built from the data of all nodes; each node then gets
    ``weight_VSM`` set to the model's similarity between a document of its
    data and a document of ``minePackage['searchKey']``.
    """
    query = Document((minePackage['searchKey']))
    clouds = minePackage['clouds']
    # These helpers are still invoked exactly as before, although their
    # results are not used here.
    UnPack().total(clouds)
    UrlToPlainText()

    webDocuments = []
    for cloud in clouds:
        node_table = cloud.graph.node
        for n in cloud.graph.nodes():
            webDocuments.append(Document(node_table[n]['methodData'].getData()))

    m = Model(documents=webDocuments, weight=TFIDF)

    for cloud in clouds:
        node_table = cloud.graph.node
        for n in cloud.graph.nodes():
            vector = Document(node_table[n]['methodData'].getData())
            # Sets the VSM value on the cloud node.
            node_table[n]['weight_VSM'] = m.similarity(vector, query)
def summarize(raw_text):
    """Return up to three key sentences of ``raw_text`` joined by spaces.

    Sentences with more than five space-separated tokens and at least one
    document feature become graph nodes; pairs with a positive Jaccard
    similarity of their features are linked by weighted edges and ranked
    with PageRank.  The three best sentences are returned in their original
    order; when nothing could be ranked the first sentence is returned.

    Args:
        raw_text: text to summarize.

    Returns:
        The summary string, or ``""`` for empty or whitespace-only input.
    """
    # Whitespace-only input used to pass the emptiness check, tokenize to
    # nothing and then fail on ``tokens[0]``.
    if not raw_text.strip():
        return ""
    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokens = sentence_tokenizer.tokenize(raw_text.strip())
    if not tokens:
        return ""

    documents = []
    for position, sentence in enumerate(tokens):
        if len(sentence.split(" ")) > 5:
            document = Document(string=sentence, name=position, stemmer=LEMMA)
            if len(document.features) > 0:
                documents.append(document)

    edges = []
    for document in documents:
        for other_document in documents:
            if document.name == other_document.name:
                continue
            similarity = jaccard_similarity(document.features,
                                            other_document.features)
            if similarity > 0:
                edges.append((document.name, other_document.name, similarity))

    graph = networkx.DiGraph()
    graph.add_weighted_edges_from(edges)
    page_rank = networkx.pagerank(graph)
    sorted_ranks = sorted(page_rank.items(), key=operator.itemgetter(1),
                          reverse=True)

    num_sentences = 3
    # Node names are sentence positions; sorting restores reading order.
    sentence_numbers = sorted(node[0] for node in sorted_ranks[:num_sentences])
    summary = [tokens[sentence_number] for sentence_number in sentence_numbers]
    if not summary:
        summary.append(tokens[0])
    return " ".join(summary)
def extractSentiment(characterSentences):
    """
    Train a Naive Bayes classifier on the rows of reviews.csv, then classify
    the string form of every sentence of every character.

    Returns a defaultdict mapping each character key that has at least one
    sentence to the list of classification results, in sentence order.
    """
    classifier = NB()
    characterTones = defaultdict(list)

    for review, rating in csv("reviews.csv"):
        example = Document(review, type=int(rating), stopwords=True)
        classifier.train(example)

    for character, sentences in characterSentences.items():
        for sentence in sentences:
            tone = classifier.classify(str(sentence))
            characterTones[character].append(tone)
    return characterTones
def sync_corpus(self):
    """Create the user's corpus, or refresh it when a sync is needed.

    Three cases, decided by the user's stored ``corpus`` flag and
    ``self.need_sync``:

    * flag set and sync needed: documents for the notes returned by
      ``resync_db()`` are rebuilt, removed from and re-added to the loaded
      corpus, which is saved with ``update=True``;
    * flag set, no sync needed: nothing happens;
    * flag not set: a new corpus is built from all notes of this user,
      saved, and the flag is set on the user record.

    TODO: Store other data in the corpus besides basic text content, ie,
    extracted image, attribute note data, etc...
    catch corpus not found file error?
    """
    docs =[]
    corpus_check = self.mongo.users.find_one({'_id':self.user_id}, {'corpus':1}).get('corpus')
    # make sure we already created corpus
    if corpus_check and self.need_sync:
        update_guids = self.resync_db()
        corpus = self.load_corpus()
        # only those that need to be updated from the update_guids
        for x in self.mongo.notes.find(
                {'_id':{'$in':update_guids}},{'tokens_content':1,'str_title':1}):
            # create the updated doc
            # NOTE(review): top=50 here but top=30 for the initial build
            # below -- confirm the difference is intended.
            d = Document(x['tokens_content'],name=x['str_title'],top=50)
            # set the id to what we want
            d._id = x['_id']
            docs.append(d)
            # remove old doc because corpus will still have old content
            # (presumably matched through the id assigned above -- verify)
            corpus.remove(d)
        corpus.extend(docs)
        self.save_corpus(corpus,update=True)
    # dont need the sync, do nothing
    elif corpus_check:
        return
    # corpus sync has not been done before
    else:
        for x in self.mongo.notes.find( # all notes of this user
                {'_id_user':self.user_id},{'tokens_content':1,'str_title':1}):
            d = Document(x['tokens_content'],name=x['str_title'],top=30)
            d._id = x['_id']
            docs.append(d)
        corpus = Corpus(docs)
        self.save_corpus(corpus)
        # remember that the corpus now exists for this user
        self.mongo.users.update({'_id':self.user_id},{'$set':{'corpus':True}})
def create_doc_list(df):
    '''
    Build a list of Pattern.Vector Documents from a dataframe that has an
    'id' column and a 'review' column.

    The first and last character of each id (an extra quote on either side
    in this data) are stripped before the id is used as the document name.
    '''
    print("Creating a list of {} documents".format(len(df)))
    return [
        Document(row['review'], threshold=1, name=row['id'][1:-1])
        for _, row in df.iterrows()
    ]
def classify(text):
    """Run ``text`` through the four ``Classifications`` classifiers.

    Returns a dict with the text, the predicted rate and category (from a
    ``Document`` of the text), the rate predicted from the selected words,
    and a boolean ``positivity`` derived from the sentiment probabilities.
    """
    predicted_category = Classifications._category.classify(
        Document(text), discrete=True)
    predicted_rate = Classifications._rating.classify(
        Document(text), discrete=True)
    predicted_rate_nlp = Classifications._rating_nlp.classify(
        Classifications.selectWords(text), discrete=True)
    predicted_sentiment_dict = Classifications._sentiment.classify(
        Classifications.selectWords(text), discrete=False)

    ranked = sorted(predicted_sentiment_dict.items(),
                    key=operator.itemgetter(1), reverse=True)
    # NOTE(review): index 1 selects the second-ranked label, not the most
    # probable one -- confirm this is intended.
    chosen_label = str(ranked[1][0])
    predicted_sentiment = chosen_label in ['True', '3.0', '4.0', '5.0']

    return {
        'text': text,
        'rate': predicted_rate,
        'category': predicted_category,
        'rate_nlp': predicted_rate_nlp,
        'positivity': predicted_sentiment
    }
def extract():
    """Extract clustered features from every app description CSV.

    ``OUTPUT_PATH`` is recreated from scratch.  For each non-hidden
    directory of ``INPUT_PATH`` and each CSV file inside it (header row
    skipped), every app row is processed: its description (third column) is
    prepared, three-word featurelets are extracted and clustered
    hierarchically, and a row of name, description, features and
    alternative tokens is written to the file of the same name under
    ``OUTPUT_PATH``.
    """
    print('Extracting features from app descriptions...\n')
    if os.path.exists(OUTPUT_PATH):
        shutil.rmtree(OUTPUT_PATH)
    os.makedirs(OUTPUT_PATH)

    for category in os.listdir(INPUT_PATH):
        if category.startswith('.'):
            continue
        os.makedirs("{}/{}".format(OUTPUT_PATH, category))
        for filename in os.listdir('{}/'.format(INPUT_PATH) + category):
            source_path = '{}/{}/{}'.format(INPUT_PATH, category, filename)
            target_path = '{}/{}/{}'.format(OUTPUT_PATH, category, filename)
            with open(source_path, 'rb') as source:
                reader = csv.reader(source)
                next(reader)
                with open(target_path, 'wb') as target:
                    writer = csv.writer(target)
                    for app in reader:
                        name = app[0]
                        description = app[2]
                        # Prepare the description for NLTK and LDA processing
                        prepared = prepare_description(description)
                        # One document per three-word featurelet
                        documents = []
                        for feature in featurelet_extraction(prepared):
                            featurelet = '{} {} {}'.format(
                                feature[0], feature[1], feature[2])
                            documents.append(
                                Document(featurelet, name=featurelet))
                        # Hierarchical clustering of the featurelets
                        cluster = Model(documents).cluster(
                            method=HIERARCHICAL, k=3, iterations=1000,
                            distance=COSINE)
                        # Group clusters into features and alternative tokens
                        (features, alterTokens) = group(cluster, [], [], [])
                        writer.writerow(
                            [name, description, features, alterTokens])
def get_top_freq_words_in_text(txt_string, top_count, filter_method = lambda w: w.lstrip("'").isalnum(), exclude_len = 0):
    """ Return the most frequent words of a text.

        Args:
            txt_string (str): Input string.
            top_count (int): number of top words to be returned.

        Kwargs:
            filter_method (method): predicate handed to Document as its word
                                    filter. The default keeps words that are
                                    alphanumeric after stripping leading
                                    apostrophes.
            exclude_len (int): when non-zero, keywords whose length is less
                                than or equal to this value are dropped.
                                Default 0 disables the exclusion.

        Returns:
            (list): at most top_count words, encoded, reduced to singular
                    form and de-duplicated.
    """
    docu = Document(txt_string, threshold=1, filter = filter_method)

    ## Ask for five extra keywords so that enough remain after the
    ## exclusions and de-duplication below.
    freq_keyword_tuples = docu.keywords(top = top_count + 5)

    selected = []
    for entry in freq_keyword_tuples:
        keyword = entry[1]
        if exclude_len != 0 and len(keyword) <= exclude_len:
            continue
        ## encode for unicode handling
        selected.append(keyword.encode())

    ## reduce all words to the same form, then remove duplicates
    singular = [get_singular_form_of_word(word) for word in selected]
    return rm_duplicate_keywords(singular)[:top_count]
def get_top_freq_words_in_text(txt_string, top_count, filter_method=lambda w: w.lstrip("'").isalnum(), exclude_len=0):
    """ Return the most frequent words of a text.

        Args:
            txt_string (str): Input string.
            top_count (int): number of top words to be returned.

        Kwargs:
            filter_method (method): predicate handed to Document as its word
                                    filter. The default keeps words that are
                                    alphanumeric after stripping leading
                                    apostrophes.
            exclude_len (int): when non-zero, keywords whose length is less
                                than or equal to this value are dropped.
                                Default 0 disables the exclusion.

        Returns:
            (list): at most top_count words, encoded, reduced to singular
                    form and de-duplicated.
    """
    docu = Document(txt_string, threshold=1, filter=filter_method)

    ## Ask for five extra keywords so that enough remain after the
    ## exclusions and de-duplication below.
    freq_keyword_tuples = docu.keywords(top=top_count + 5)

    selected = []
    for entry in freq_keyword_tuples:
        keyword = entry[1]
        if exclude_len != 0 and len(keyword) <= exclude_len:
            continue
        ## encode for unicode handling
        selected.append(keyword.encode())

    ## reduce all words to the same form, then remove duplicates
    singular = [get_singular_form_of_word(word) for word in selected]
    return rm_duplicate_keywords(singular)[:top_count]
def text_to_database(self, full_text, sha224, title, fluent_anki_session, target_language): language = DBWrapper().get_or_create_language(fluent_anki_session, target_language) #parse text into objects pobj = Parser().parse(full_text) #import pdb; pdb.set_trace() doc = Document(full_text) #create the source text database object source_text_dbobj = SourceText(title = title, hash_text=sha224, text_length=len(pobj.tagged_words)) black_list = DBWrapper().get_black_list_word_set(fluent_anki_session) for word in tqdm(pobj.unique_words): #see if it's on the black list if (word not in black_list) and (not Util().has_numbers(word)): #length of the word should not be null if word: wstf = Word_SourceText_Frequency(frequency = pobj.word_frequency_dict[word]) wstf.text_pos = pobj.unique_parse_words_dict[word].text_pos wstf.tfidf = doc.tfidf(word) wtype = pobj.unique_parse_words_dict[word][0].type w = ExoticWord(text=word, word_type=wtype, lang=language.id) wstf.words = w source_text_dbobj.words.append(wstf) for sentence in pobj.unique_parse_words_dict[word].suggested_sentences: sent = DBWrapper().get_or_create_sentence(fluent_anki_session, sentence.string, target_language) sent.words.append(w) fluent_anki_session.add(sent) #write parsed words to database fluent_anki_session.add(source_text_dbobj) fluent_anki_session.commit()
def get_model_from_documents(path='./*/*.txt'):
    '''Return the documents matching ``path`` and a TF-IDF model of them.

    Args:
        path: glob pattern selecting the files to read.  The pattern is now
            honoured; previously it was ignored and ``./*/*.*`` was always
            used.

    Returns:
        tuple ``(documents, model)``: the list of ``Document`` objects, one
        per matched file, and the ``Model`` built from them.
    '''
    import codecs
    import glob
    from pattern.vector import Document, Model, TFIDF

    documents = []
    for filename in glob.glob(path):
        # The with-block closes each file; handles used to be left open.
        with codecs.open(filename, 'r') as handle:
            documents.append(Document(handle.read()))
    model = Model(documents=documents, weight=TFIDF)
    return documents, model
class KeywordWrapper(object):
    """Plain helper (not a Django model) around pattern.vector keyword calls.

    See views.keywords() for usage.
    """

    def __init__(self):
        """Start with no text loaded and a keyword limit of 10."""
        self.time_start = None
        self.params = {}
        self.original_text = None
        self.original_text_md5_hash = None
        self.document_raw = None
        self.document_raw_count = None
        self.document_thresh_stemmed = None
        self.document_thresh_unstemmed = None
        self.top_num = 10
        self.keywords_stemmed = None
        self.keywords_unstemmed = None
        self.keywords_unstemmed_additional = None
        self.keywords_stemmed_simple = []
        self.explore_json_string = None
        self.simple_json_string = None

    def get_params(self, dj_request):
        """Copy the request's GET parameters (or POST ones otherwise) into self.params."""
        assert type(dj_request) == django.core.handlers.wsgi.WSGIRequest
        if dj_request.method == u'GET':
            submitted = dj_request.GET
        else:  # POST
            submitted = dj_request.POST
        for key, value in submitted.items():
            self.params[key] = value

    def load_text(self, text):
        """Record the start time, build the three Documents and hash the text."""
        self.time_start = datetime.datetime.now()
        self.document_raw = Document(text, threshold=0)
        self.document_raw_count = self.document_raw.count
        self.document_thresh_stemmed = Document(text, stemmer=PORTER, threshold=1)
        self.document_thresh_unstemmed = Document(text, threshold=1)
        self.original_text = text
        # unicode text -> utf-8 bytes -> md5 hex digest -> unicode digest
        utf8_text = self.original_text.encode(u'utf-8', u'replace')
        hex_digest = hashlib.md5(utf8_text).hexdigest()
        self.original_text_md5_hash = hex_digest.decode(u'utf-8', u'replace')

    def set_top_num(self):
        """Raise top_num by one per step of range(1, count, 1000), stopping when it hits 50."""
        assert type(self.document_raw) == pattern.vector.Document
        for _ in range(1, self.document_raw.count, 1000):
            self.top_num += 1
            if self.top_num == 50:
                break

    def make_keywords_stemmed_simple(self):
        """Compute the stemmed keywords and append their words to keywords_stemmed_simple."""
        assert type(self.document_thresh_stemmed) == pattern.vector.Document
        self.keywords_stemmed = self.document_thresh_stemmed.keywords(top=self.top_num)
        # each entry is indexed as (score, word); keep the word only
        self.keywords_stemmed_simple.extend(entry[1] for entry in self.keywords_stemmed)

    def make_default_keywords(self):
        """Compute both the stemmed and the unstemmed keyword lists."""
        assert type(self.document_thresh_stemmed) == pattern.vector.Document
        assert type(self.document_thresh_unstemmed) == pattern.vector.Document
        self.keywords_stemmed = self.document_thresh_stemmed.keywords(top=self.top_num)
        self.keywords_unstemmed = self.document_thresh_unstemmed.keywords(top=self.top_num)

    def make_additional_keywords(self):
        """Collect unstemmed keywords that the stemmed list does not already cover."""
        assert type(self.keywords_stemmed) == list
        if len(self.keywords_stemmed) > 0:
            assert type(self.keywords_stemmed[0]) == tuple
        assert type(self.keywords_unstemmed) == list
        if len(self.keywords_unstemmed) > 0:
            assert type(self.keywords_unstemmed[0]) == tuple

        # words only, taken from the (score, word) tuples of the stemmed list
        stemmed_words = [entry[1] for entry in self.keywords_stemmed]

        # keep an unstemmed keyword only when neither the word itself nor its
        # stem already appears among the stemmed words
        self.keywords_unstemmed_additional = []
        for entry in self.keywords_unstemmed:
            word = entry[1]
            # TODO: time using sets here instead
            if word in stemmed_words:
                continue
            if stem(word, stemmer=PORTER) in stemmed_words:
                continue
            self.keywords_unstemmed_additional.append(entry)

    def build_explore_json_string(self):
        """Serialise counts, keyword lists, hash and timing into explore_json_string."""
        payload = {
            u'count_words_raw': len(self.original_text.split()),
            u'count_words_analyzed': self.document_raw.count,
            u'count_words_repeating_stemmed': self.document_thresh_stemmed.count,
            u'count_words_repeating_unstemmed': self.document_thresh_unstemmed.count,
            u'count_keywords_stemmed': len(self.keywords_stemmed),
            u'count_keywords_unstemmed': len(self.keywords_unstemmed),
            u'count_keywords_unstemmed_additional': len(self.keywords_unstemmed_additional),
            u'hash_md5': self.original_text_md5_hash,
            u'keywords_stemmed': self.keywords_stemmed,
            u'keywords_unstemmed': self.keywords_unstemmed,
            u'keywords_unstemmed_additional': self.keywords_unstemmed_additional,
            u'repeating_words_unstemmed': self.document_thresh_unstemmed.terms,
            u'time_start': unicode(self.time_start),
            u'time_taken': unicode(datetime.datetime.now() - self.time_start),
            u'docs': app_settings.DOCS_URL,
        }
        self.explore_json_string = json.dumps(payload, sort_keys=True, indent=2)

    def build_simple_json_string(self):
        """Serialise the simple stemmed keyword list, hash and timing into simple_json_string."""
        payload = {
            u'count_keywords_stemmed': len(self.keywords_stemmed),
            u'keywords_stemmed': self.keywords_stemmed_simple,
            u'hash_md5': self.original_text_md5_hash,
            u'time_start': unicode(self.time_start),
            u'time_taken': unicode(datetime.datetime.now() - self.time_start),
            u'docs': app_settings.DOCS_URL,
        }
        self.simple_json_string = json.dumps(payload, sort_keys=True, indent=2)
# coding=utf-8
# Demo: build a pattern.vector Document from a short news paragraph and print
# its keywords, two statistics for the word "flight", its features, words and
# vector.
from pattern.vector import Document

s = '''
The shuttle Discovery, already delayed three times by technical problems and bad weather,
was grounded again Friday, this time by a potentially dangerous gaseous hydrogen leak
in a vent line attached to the shipʼs external tank. The Discovery was initially
scheduled to make its 39th and final flight last Monday, bearing fresh supplies and
an intelligent robot for the International Space Station. But complications delayed
the flight from Monday to Friday, when the hydrogen leak led NASA to conclude that
the shuttle would not be ready to launch before its flight window closed this Monday.
'''

d = Document(s)

# Every print below passes exactly one argument in parentheses, which prints
# the same text under the Python 2 print statement and the Python 3 print
# function (the bare statement form is a SyntaxError on Python 3).
print(d.keywords(top=10))

# NOTE(review): this writes a private attribute directly; confirm whether
# Document offers a public way to set the description.
d._description = 'sample corpus'
print(d._description)

print(d.term_frequency('flight'))
print(d.tfidf('flight'))
print(d.features)
print(d.words)
# Two spaces after '=': the original `print 'vector = ', d.vector` emitted the
# literal's trailing space plus the separator space.
print('vector =  ' + str(d.vector))
# e.g., "conspiracies" => "conspiracy", "conspired" => "conspire". s = """ The shuttle Discovery, already delayed three times by technical problems and bad weather, was grounded again Friday, this time by a potentially dangerous gaseous hydrogen leak in a vent line attached to the ship's external tank. The Discovery was initially scheduled to make its 39th and final flight last Monday, bearing fresh supplies and an intelligent robot for the International Space Station. But complications delayed the flight from Monday to Friday, when the hydrogen leak led NASA to conclude that the shuttle would not be ready to launch before its flight window closed this Monday. """ # With threshold=1, only words that occur more than once are counted. # With stopwords=False, words like "the", "and", "I", "is" are ignored. document = Document(s, threshold=1, stopwords=False) print(document.words) print() # The /corpus folder contains texts mined from Wikipedia. # Below is the mining script (we already executed it for you): #import os, codecs #from pattern.web import Wikipedia # #w = Wikipedia() # for q in ( # "badger", "bear", "dog", "dolphin", "lion", "parakeet", # "rabbit", "shark", "sparrow", "tiger", "wolf"): # s = w.search(q, cached=True) # s = s.plaintext()
###
### Streaming mapper: reads one JSON object per line from stdin, downloads the
### page behind its "link", extracts keywords from the page's plain text and
### prints title, link, source and the JSON-encoded keywords, tab-separated.
###
### To test from the command line, feed the script a line such as the output of:
### echo "{ \"title\" : \"james muguira\", \"link\" : \"http://rss.cnn.com/rss/cnn_topstories.rss\", \"source\" : \"hello world\", \"data\" : \"{ json }\" }"
###
### for example
###
### insert into table cnn_top select transform (text)
###   using 'python map_strm.py' as (title, link, source, data)
###   from test;
###
import sys
import json
from contextlib import closing

from pattern.vector import Document
from pattern.web import plaintext
import urllib2

for line in sys.stdin:
    line = line.strip()
    if not line:
        # A blank line carries no record, and json.loads('') would raise
        # ValueError and abort the whole stream.
        continue
    ljs = json.loads(line)

    # closing() guarantees the HTTP response is released after reading, even
    # if read() fails; one connection is opened per input row.
    with closing(urllib2.urlopen(ljs['link'])) as response:
        fjs = response.read()

    st = plaintext(fjs)
    d = Document(st)
    w = json.dumps(d.keywords())
    # Single-argument print(...) writes the same line under the Python 2
    # print statement and the Python 3 print function.
    print("%s\t%s\t%s\t%s" % (ljs['title'], ljs['link'], ljs['source'], w))
# "conspiracy" and "conspired" are both reduced to "conspir".

# Sample text analysed below.
s = """
The shuttle Discovery, already delayed three times by technical problems and bad weather,
was grounded again Friday, this time by a potentially dangerous gaseous hydrogen leak
in a vent line attached to the ship's external tank. The Discovery was initially
scheduled to make its 39th and final flight last Monday, bearing fresh supplies and
an intelligent robot for the International Space Station. But complications delayed
the flight from Monday to Friday, when the hydrogen leak led NASA to conclude that
the shuttle would not be ready to launch before its flight window closed this Monday.
"""

# With threshold=1 (default), only words that occur more than once are counted.
# Some stop words like "the", "and", "I", "is" are always ignored.
document = Document(s, threshold=1)
# Single-argument print(...) calls print the same text under the Python 2
# print statement and the Python 3 print function; print("") stands in for
# the bare `print` that emitted an empty line.
print(document.terms)
print("")

# The corpus/ folder contains some texts retrieved from Wikipedia.
# Here is the code (we already executed it for you):

#from pattern.web import Wikipedia
#
#wp = Wikipedia()
#for q in (
#  "badger", "bear", "dog", "dolphin", "lion", "parakeet",
#  "rabbit", "shark", "sparrow", "tiger", "wolf"):
#    s = wp.search(q, cached=True)
#    s = s.plaintext()
#    f = codecs.open(os.path.join("corpus", q+".txt"), "w", encoding="utf-8")