def compare(origin_article_obj, tgt_article_objs):
    tgt_paragraph_docs = []
    tgt_grafs = []
    for obj in tgt_article_objs:
        if obj['paragraphs'] is not None:
            for graf in obj['paragraphs']:
                tgt_grafs.append({
                    'text': graf,
                    'url': obj['url'],
                    'img': obj['img_src'],
                    'title': obj['title']
                })
                tgt_paragraph_docs.append(Document(graf, description=obj['url']))
    origin_graf_doc = Document(' '.join(origin_article_obj['paragraphs']), description='origin')
    m = Model(documents=tgt_paragraph_docs + [origin_graf_doc], weight=TFIDF)
    # Sort target paragraphs by similarity to the origin article (ascending).
    tgts_by_dist = sorted(range(len(tgt_paragraph_docs)),
                          key=lambda i: m.similarity(origin_graf_doc, tgt_paragraph_docs[i]))
    furthest = [tgt_grafs[i] for i in tgts_by_dist]
    # Walk from most to least similar, skipping duplicate URLs or texts.
    furthest_unique = []
    for entry in furthest[::-1]:
        if any(obj['url'] == entry['url'] or obj['text'] == entry['text'] for obj in furthest_unique):
            continue
        furthest_unique.append(entry)
    return furthest_unique[:10]
def articles_to_trends(articles):
    news = {}
    for story in articles:
        if story['added_at']:
            article_text = get_article_text(story['url'])
            d, s = timestamptext(story['added_at'], article_text)
            # Each key in the news dictionary is a date: news is grouped per day.
            # Each value is a dictionary of id => story items.
            # We use a hash of the story text as a unique id to avoid duplicate content.
            news.setdefault(d, {})[hash(s)] = s
    m = Model()
    for date, stories in news.items():
        s = stories.values()
        s = ' '.join(s).lower()
        # Each day of news is a single document.
        # By adding all documents to a model we can calculate tf-idf.
        m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=date))
    for document in m:
        print document.name
        print document.keywords(top=10)
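# --- Usage sketch (not from the original source) ---------------------------
# A self-contained illustration of the same idea with made-up per-day texts:
# one Document per day, tf-idf across days, keywords() to surface the trends.
from pattern.vector import Document, Model, LEMMA

days = {
    '2014-01-01': 'stocks rally as markets open higher stocks stocks',
    '2014-01-02': 'storm warning issued as a heavy storm hits the coast',
}
trend_model = Model()
for day, text in days.items():
    trend_model.append(Document(text.lower(), stemmer=LEMMA, exclude=['news', 'day'], name=day))
for day_doc in trend_model:
    # keywords() returns (weight, word) pairs; with several documents in the
    # model the weights are tf-idf, so day-specific words rank highest.
    print day_doc.name, day_doc.keywords(top=3)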
def calculate(self, minePackage, progress):
    webDocuments = []
    query = Document(minePackage['searchKey'])
    clouds = minePackage['clouds']
    count = UnPack()
    totalLinks = count.total(clouds)
    progress.set_totalIR(totalLinks)    # total number of documents to retrieve
    progress.set_IRState('Ejecutando')  # update the state of the process
    urlContent = UrlToPlainText()
    step = 0
    for cloud in clouds:
        if not progress.get_stop():
            for n in cloud.graph.nodes():
                if not progress.get_stop():
                    doc = cloud.graph.node[n]['methodData']
                    webDocuments.append(Document(doc.getData()))
                    step += 1
                    progress.set_IRProgress(step)  # step-by-step progress
                else:
                    break
        else:
            break
    if not progress.get_stop():
        m = Model(documents=webDocuments, weight=TFIDF)
        for cloud in clouds:
            for n in cloud.graph.nodes():
                methodData = cloud.graph.node[n]['methodData']
                vector = Document(methodData.getData())
                cloud.graph.node[n]['weight_VSM'] = m.similarity(vector, query)
def crearIndiceInvertidoCuerpoMensajes():
    cuerpo1 = '''
    Estimados socios: ya hemos firmado el contrato de compraventa con el cliente preferencial.
    Espero noticias vuestras.
    Un saludo,
    '''
    cuerpo2 = '''
    Estimados Antonio: agradezco mucho tus buenas noticias, aunque me temo que el documento
    que debe adjuntarse al contrato se va a retrasar unos dias.
    Un saludo,
    '''
    cuerpo3 = '''
    Estimados socios: aunque el contrato no este legalizado aun, me he permitido hacer
    una transferencia por la mitad del importe al contratista.
    Un saludo,
    '''
    cuerpo4 = '''
    Estimados socios: muchas gracias por las gestiones. se lo comunicare al cliente hoy mismo.
    Un saludo,
    '''
    cuerpo5 = '''
    Estimado Luis: ya hemos realizado una transferencia a su cuenta por el importe establecido inicialmente.
    Un saludo,
    '''
    cuerpo6 = '''
    Un saludo,
    '''
    correo1 = Document(cuerpo1, name="correo1", threshold=0, stopwords=True, language='es')
    correo2 = Document(cuerpo2, name="correo2", threshold=0, stopwords=True, language='es')
    correo3 = Document(cuerpo3, name="correo3", threshold=0, stopwords=True, language='es')
    correo4 = Document(cuerpo4, name="correo4", threshold=0, stopwords=True, language='es')
    correo5 = Document(cuerpo5, name="correo5", threshold=0, stopwords=True, language='es')
    correo6 = Document(cuerpo6, name="correo6", threshold=0, stopwords=True, language='es')
    modeloCorreos = Model(documents=[correo1, correo2, correo3, correo4, correo5, correo6], weight=TFIDF)
    correos = modeloCorreos.documents
    informacion = ""
    for correo in correos:
        palabras = correo.features
        for palabra in palabras:
            informacion += ("Palabra: " + str(palabra) + " => índice invertido: " +
                            str(modeloCorreos.idf(palabra)) + ", correo: " + correo.name + "\n")
    print informacion
def word_ranking(text, n='L2'):
    """
    Extract the most relevant words from text according to the LSA algorithm.

    Steps:
        1. tokenize text by sentences
        2. compute tfidf matrix
        3. apply SVD to the tfidf matrix (reduce to n dimensions)
        4. rank according to the cross-method
           (source: http://www.aclweb.org/anthology/C10-1098.pdf)

    - text: string consisting of a few sentences
    - n: number of dimensions to reduce to (or 'L2' for the L2-norm heuristic)
    """
    # tokenize text to sentences list
    sentences = tokenize(text)
    #==========================================================================
    # #syntactic filter
    # exclude_list = []
    # for sent in sentences:
    #     for word, pos in tag(sent):
    #         if pos != "JJ" or pos != 'NN':  # Retrieve all adjectives and nouns.
    #             exclude_list.append(word.lower())
    #==========================================================================
    # create documents list
    # stop words and punctuation are erased by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]
    # model initialize
    m = Model(docs, weight=TFIDF)
    # dimensions number equal to euclidean norm of singular values
    # U, S, Vt = np.linalg.svd(m.vectors, full_matrices=False)
    # dimensions = int(round(np.linalg.norm(S, 2)))
    m.reduce(dimensions=n)
    # selection according to the cross-method
    # source: http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf
    # topics (rows) x tokens (cols) matrix (tfidf)
    V = np.array(m.lsa.vt)
    # average sentence score for each concept/topic in the rows of the Vt matrix
    avg_score = np.mean(V, axis=1).reshape((-1, 1))
    # cell values which are less than or equal to the average score are set to zero
    V[V <= avg_score] = 0.0
    # sigma matrix after performing SVD
    S = np.array(m.lsa.sigma).reshape((-1, 1))
    # total length of each term vector
    length = np.sum(V * S, axis=0)
    # ranking words by length score
    ranking = Counter(dict(zip(m.lsa.terms, length)))  # .most_common(n)
    # words, score = list(zip(*ranking))
    return ranking
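# --- Usage sketch (not from the original source) ---------------------------
# Assuming the imports used above (numpy as np, Counter, pattern.vector) and a
# `tokenize` helper that splits text into sentences, the returned Counter can
# be queried with most_common():
text = ("The cat sat on the mat. The dog chased the cat. "
        "The mat was red and the dog was happy.")
ranking = word_ranking(text, n=2)   # reduce the LSA space to 2 dimensions
for word, score in ranking.most_common(5):
    print word, round(score, 3)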
def feeds_to_trends(feeds):
    for url in feeds:
        url = url['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                d, s = datetext(story.date, story.description)
                # Each key in the news dictionary is a date: news is grouped per day.
                # Each value is a dictionary of id => story items.
                # We use hash(story.description) as a unique id to avoid duplicate content.
                news.setdefault(d, {})[hash(s)] = s
            m = Model()
            for date, stories in news.items():
                s = stories.values()
                s = ' '.join(s).lower()
                # Each day of news is a single document.
                # By adding all documents to a model we can calculate tf-idf.
                m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=date))
            for document in m:
                print document.name
                print document.keywords(top=10)
        except HTTP404NotFound:
            print url
def getMod():
    essay_path = 'essays/original/'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    for f in files:
        with io.open(f, 'r', encoding='utf-8') as w:
            text = TextBlob(PageParser.parse(w.read()))
            text = ' '.join([word for word in text.words
                             if word not in cachedStopWords]).lstrip()
            #ent_text = ' '.join(er.recognize_entities(text.sentences))
            #ent_text = PageParser.parse(w.read())
            docs.append(Document(text, name=f, top=40))
    m = Model(docs)
    lsa = m.reduce(5)
    return lsa
    # NOTE: everything below is unreachable after the return above.
    # Clustering could be a useful technique, commenting out for now
    #with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
    #    write_cluster(m.cluster(method=HIERARCHICAL, k=4), w, "")
    with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
        for i, concept in enumerate(m.lsa.concepts):
            print("Concept {0}:".format(i)),
            w.write(unicode("Concept {0}:".format(i)))
            count = 0
            # Show only the first 5 features we come across
            for feature, weight in m.lsa.concepts[i].items():
                if abs(weight) > 0.2:
                    print(feature),
                    w.write(feature + " ")
                    count += 1
                    if count > 5:
                        break
            w.write(unicode('\n'))
            #print
            cat_docs = []
            for d in m.documents:
                cat = (0, 0, {})
                #print d.name.split('\\')[-1]
                for idx, weight in m.lsa.vectors[d.id].items():
                    print "\tCat {0}: {1}".format(idx, weight)
                    if abs(weight) > abs(cat[1]) or cat[1] == 0:
                        cat = (idx, weight, d)
                if cat[0] == i:
                    cat_docs.append(cat)
                    #print "\t{0}".format(d.name.split('\\')[-1])
            cat_docs.sort(key=lambda tup: abs(tup[1]), reverse=True)
            for cat, weight, d in cat_docs:
                f = d.name.split('\\')[-1]
                w.write(unicode("\t{0} - {1}\n").format(
                    filter(lambda x: x in string.printable, f), weight))
def bag_of_words_tfidf(lst):
    '''
    Constructs a bag-of-words model, where each document is a Facebook post/comment.
    Also applies TF-IDF weighting, lemmatization, and filters out stopwords.
    '''
    model = Model(documents=[], weight=TFIDF)
    for msg, link in lst:
        doc = Document(msg, stemmer=LEMMA, stopwords=True, name=msg, description=link)
        model.append(doc)
    return model
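# --- Usage sketch (not from the original source) ---------------------------
# Querying the returned model for the stored posts closest to a new message.
# The posts and links below are made up.
posts = [
    ('I love this new phone, the camera is great', 'http://example.com/p1'),
    ('Terrible battery life, would not recommend', 'http://example.com/p2'),
    ('The camera takes amazing pictures at night', 'http://example.com/p3'),
]
fb_model = bag_of_words_tfidf(posts)
query = Document('how good is the camera', stemmer=LEMMA, stopwords=True)
for similarity, doc in fb_model.neighbors(query, top=2):
    # doc.description holds the link stored when the model was built.
    print round(similarity, 2), doc.name, doc.description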
def runTFIDF():
    """Given a list of classes, construct a Vector-Space Model.
    We only need to do it once and save it to a pickle file for fast loading later on."""
    model = Model(documents=[], weight=TFIDF)
    for r, d, files in os.walk("project/data/"):
        for f in files:
            if f.endswith(".txt"):
                text = readFile(f)
                doc = Document(text, stemmer=LEMMA, stopwords=True, name=f.replace(".txt", ""))
                model.append(doc)
    model.save("project/pickle/course.pic")
def extract():
    print 'Extracting features from app descriptions...\n'
    if os.path.exists(OUTPUT_PATH):
        shutil.rmtree(OUTPUT_PATH)
    os.makedirs(OUTPUT_PATH)
    for dir in os.listdir(INPUT_PATH):
        if not dir.startswith('.'):
            os.makedirs("{}/{}".format(OUTPUT_PATH, dir))
            for file in os.listdir('{}/'.format(INPUT_PATH) + dir):
                with open('{}/{}/{}'.format(INPUT_PATH, dir, file), 'rb') as f:
                    reader = csv.reader(f)
                    next(reader)
                    with open('{}/{}/{}'.format(OUTPUT_PATH, dir, file), 'wb') as r:
                        writer = csv.writer(r)
                        for app in reader:
                            name = app[0]
                            description = app[2]
                            # Prepare an app description string for NLTK and LDA processing
                            preparedDescription = prepare_description(description)
                            # Extract 3-word featurelets from the description
                            featurelets = featurelet_extraction(preparedDescription)
                            docs = []
                            for feature in featurelets:
                                featurelet = '{} {} {}'.format(feature[0], feature[1], feature[2])
                                docs.append(Document(featurelet, name=featurelet))
                            # Perform hierarchical clustering
                            m = Model(docs)
                            cluster = m.cluster(method=HIERARCHICAL, k=3, iterations=1000, distance=COSINE)
                            # Organize clusters into features and alternative tokens
                            (features, alterTokens) = group(cluster, [], [], [])
                            # Write results to file
                            writer.writerow([name, description, features, alterTokens])
                    r.close()
                f.close()
def compute_topics(set_reduce_topics, today):
    # Based on similarity
    # Based on words
    cleanup_topic(today.day, today.month, today.year)
    ScrapedTopicGroups.sync()
    sites = SiteNewsScrapedData.objects.all()
    documents = []
    for site in sites:
        for sentence in site.content.split('.'):
            if sentence:
                tree = parsetree(sentence, lemmata=True)
                if len(tree) > 0:
                    documents.append(tree[0])
    # Keep only noun lemmata that are not stop words.
    documents = [[w.lemma for w in document
                  if w.tag.startswith((u'NN', u'NNS', u'NNP', u'NNPS'))
                  and w.lemma not in settings.STOP_WORDS]
                 for document in documents]
    documents = [Document(" ".join(document) + '.') for document in documents if len(document) > 1]
    model = Model_Comp(documents=documents)
    # format: (distribution, Document)
    documents_analyzed = []
    for document in documents:
        tokens = []
        similar_items_news = model.nearest_neighbors(document)
        for similarity, sim_document in similar_items_news:
            if similarity > 0.95 and sim_document.id not in documents_analyzed:
                tokens.extend([word for word, _ in sim_document.words.iteritems()])
                documents_analyzed.append(sim_document.id)
        # Add the document itself if no similar document has been counted yet.
        if document.id not in documents_analyzed:
            tokens.extend([word for word, _ in document.words.iteritems()])
            documents_analyzed.append(document.id)
        # filter the most relevant words (based on count)
        counter = defaultdict(int)
        for token in tokens:
            counter[token] += 1
        # order the counter in descending order
        tokens_org = sorted(counter.items(), key=lambda element: element[1], reverse=True)
        tokens = [token for token, count in tokens_org[:3]]
        if tokens and len(tokens) > 0:
            links = SiteNewsScrapedData.find_coincidences(tokens)
            # Only keep the topic if it has more than 3 links.
            if len(links) > 3:
                ScrapedTopicGroups.create(tags=tokens, links=links, relevance=len(links),
                                          day=today.day, month=today.month, year=today.year)
    if set_reduce_topics:
        reduce_topics(today.day, today.month, today.year)
    return True
def summarize(text, n=1):
    """
    Extract the most relevant sentences from text according to the TextRank algorithm.

    - text: string consisting of a few sentences
    - n: number of sentences to extract
    """
    # tokenize text to sentences list
    sentences = tokenize(text)
    # create documents list
    # stop words and punctuation are erased by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]
    # model initialize
    m = Model(docs, weight=TFIDF)
    # dict of TextRank ranking of cosine similarity matrix
    ranking = utils.textrank(m.documents, m.distance)
    # indexes of top n sentences
    top_sents_idx, _ = list(zip(*ranking.most_common(n)))
    # reorder to original sentence order
    output = [sentences[i] for i in sorted(top_sents_idx)]
    return ''.join(output)
def create_model(doc_list):
    '''
    Given a list of documents in Pattern.Vector Document format,
    create a Pattern.Vector Model.
    '''
    print "Creating a TFIDF model for {} documents".format(len(doc_list))
    return Model(documents=doc_list, weight=TFIDF)
def build_model(results=[]):
    documents = [
        Document(i.get('text'), name=i.get('url'), description=i.get('index'), stemmer=LEMMA)
        for i in results
    ]
    m = Model(documents, weight=TFIDF)
    y, x = 1, len(m.features)
    model = np.zeros((y, x))
    sentence_dict = {}
    model_sentences = []
    for i_index, i in enumerate(documents):
        sentences = sent_tokenize(results[i_index].get('text').lower())
        dy, dx = len(sentences), x
        for s_index, s in enumerate(sentences):
            s_words = {
                w: 1 for w in words(s, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            }
            if len(s_words) < 5:
                continue
            model_sentences.append(s)
            model = np.append(model, [[1 if s_words.get(w) else 0 for w in m.features]], 0)
            sentence_dict[model.shape[0] - 1] = i.name
            # model_sentences[model.shape[0]-1] = s
    model = np.delete(model, (0), 0)
    return model, m, model_sentences, sentence_dict
def modeling(self, descriptions, field=False, limit=False):
    """Return a pattern.vector.Model object, i.e. a list of pattern.vector.Document,
    built from EHRI.get() descriptions.

    Keyword arguments:
    descriptions --- EHRI.get() description object
    field --- Field to look into, overrides default self.field
    limit --- Debug option. Limits the model to $limit items
    """
    if field:
        self.field = field
    if limit:
        self.limit = limit
    D = []
    # Create Pattern Document elements from the data we got from Neo4J.
    # For debug reasons, we can set a limit.
    if self.limit:
        i = 0
    for description in descriptions:
        D.append(Document(description[self.field], name=description[self.identifier]))
        # And stop the iteration when i reaches the limit.
        if self.limit:
            i += 1
            if i == self.limit:
                break
    # Then, create a model from our array.
    self.model = Model(D)
    return self.model
def runTFIDFOnSchedule(term=util.currentTerm, year=util.currentYear):
    """
    Given a list of classes, construct a Vector-Space model and apply
    the TFIDF algorithm to measure similarity between courses.
    """
    model = Model(documents=[], weight=TFIDF)
    print "Loading from pickle file..."
    allCourses = loadAllCoursesInTerm()
    print "Begin constructing the Vector Space model"
    for course in allCourses:
        text = course.title + " " + course.description
        doc = Document(text, stemmer=LEMMA, stopwords=True, name=course.title,
                       description=course)
        model.append(doc)
    print "Finish processing!!!"
    with open("pickle/simCourses" + term + year + ".pickle", "w") as f:
        dump(model, f, 0)
    return model
def buscaCorreo2(x):
    documents = []
    documap = {}
    for archivo in os.listdir("Correos"):
        if archivo.endswith(".txt"):
            f = open("Correos/" + archivo, "r")
            # Skip the four header lines; the rest of the file is the mail body.
            f.readline()
            f.readline()
            f.readline()
            f.readline()
            mailbody = f.read()
            f.close()
            docu = Document(mailbody, name=archivo)
            documents.append(docu)
            docukey = int(archivo[0:-4])
            documap[docukey] = docu
    model = Model(documents=documents, weight=TFIDF)
    docu = documap[int(var.get())]
    tupla = model.neighbors(docu, top=1)[0]
    tkMessageBox.showinfo("Tk", "El documento que mas se parece es el " + tupla[1].name[0:-4] +
                          ", con un " + str(tupla[0]) + " de similitud")
def calculate(self, minePackage):
    webDocuments = []
    query = Document(minePackage['searchKey'])
    clouds = minePackage['clouds']
    count = UnPack()
    totalLinks = count.total(clouds)
    urlContent = UrlToPlainText()
    step = 0
    for cloud in clouds:
        for n in cloud.graph.nodes():
            doc = cloud.graph.node[n]['methodData']
            webDocuments.append(Document(doc.getData()))
            step += 1
    m = Model(documents=webDocuments, weight=TFIDF)
    for cloud in clouds:
        for n in cloud.graph.nodes():
            methodData = cloud.graph.node[n]['methodData']
            vector = Document(methodData.getData())
            # Store the VSM weight on the cloud node.
            cloud.graph.node[n]['weight_VSM'] = m.similarity(vector, query)
def rankingSVM(self, listaUrls, consulta, parametros):
    """
    Rank a list of URLs with the RSVM algorithm.

    Input:
        listaUrls: list of URLs to rank
        consulta: search query as a string
        parametros: parameters
    Output:
        ranked list of URLs
    """
    self.preprocesamiento.lecturaSVMRanking(listaUrls, consulta)
    # Build the attributes for each link.
    listaUrls = self.setearAtributosRanking(listaUrls, consulta)
    # Get the data points used for ranking.
    puntos = self.getAtributosRanking(listaUrls, consulta.name)
    X = np.array(puntos['X'])
    svmNorelevante = joblib.load('Model/SVM/norelevante.pkl')
    svmRelevante = joblib.load('Model/SVM/relevante.pkl')
    svmMuyrelevante = joblib.load('Model/SVM/muyrelevante.pkl')
    prediccionesNoRelevante = svmNorelevante.predict(X)
    prediccionesRelevante = svmRelevante.predict(X)
    prediccionesMuyRelevante = svmMuyrelevante.predict(X)
    listaUrls = self.preprocesamiento.limpiarListaUrls(listaUrls, puntos['name'])
    ranking = []
    modeloLista = []
    for url in listaUrls:
        documento = self.mongodb.getDocumento(url)
        if documento:
            documentoPattern = self.preprocesamiento.getDocumentoPattern(documento['_id'])
            modeloLista.append(documentoPattern)
    unModelo = Model(modeloLista)
    # Compute the SVM ranking score.
    for indice, doc in enumerate(unModelo):
        url = doc.name
        documento = {}
        documento['url'] = url
        documento['score'] = (1 - self.obtenerVectorSpaceModel(doc, consulta)) + (
            prediccionesNoRelevante[indice] +
            prediccionesRelevante[indice] * parametros[1] +
            prediccionesMuyRelevante[indice] * parametros[2])
        ranking.append(documento)
    listaNueva = sorted(ranking, key=lambda k: k['score'], reverse=True)
    return listaNueva
def kmeansCluster(self, documentList, k, iteration, distance, seed, p):
    if distance.lower() == "cosine":
        distance = COSINE
    elif distance.lower() == "euclidean":
        distance = EUCLIDEAN
    elif distance.lower() == "manhattan":
        distance = MANHATTAN
    else:
        return "invalid distance"
    if seed.lower() == "kmpp":
        seed = KMPP
    elif seed.lower() == "random":
        seed = RANDOM
    else:
        return "invalid seed"
    if type(k) is not int:
        return "k is not int"
    if type(iteration) is not int:
        return "iteration is not int"
    if type(p) is not float and type(p) is not int:
        return "p is not float"
    if type(documentList) is not list:
        return "document list is not list"
    self.iteration = iteration
    self.seed = seed
    self.p = p
    self.distance = distance
    model = Model(documentList)
    cluster = model.cluster(method=KMEANS, k=k, iterations=iteration,
                            distance=distance, seed=seed, p=p)
    return cluster
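# --- Usage sketch (not from the original source) ---------------------------
# How the return value can be consumed, assuming `clusterer` is an instance of
# the class defining kmeansCluster() above. Model.cluster() returns a list of
# clusters, each of which is a plain list of Documents.
docs = [Document('the cat purrs', name='cat'),
        Document('the dog barks', name='dog'),
        Document('the kitten purrs too', name='kitten')]
clusters = clusterer.kmeansCluster(docs, k=2, iteration=100,
                                   distance='cosine', seed='kmpp', p=0.8)
for i, cluster in enumerate(clusters):
    print i, [d.name for d in cluster]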
def runTFIDFOnCatalog(term=util.currentTerm, year=util.currentYear):
    """
    Given a dictionary of courses, construct a Vector-Space model and apply
    the TFIDF algorithm to measure similarity between courses.
    We only need to do it once and save it to a pickle file for fast loading later on.
    """
    model = Model(documents=[], weight=TFIDF)
    print "Loading from pickle file..."
    allCoursesDict = loadCourseCatalog()
    for dept in allCoursesDict:
        print "Processing department", dept
        for course in allCoursesDict[dept]:
            text = course.title + " " + course.description
            doc = Document(text, stemmer=LEMMA, stopwords=True, name=course.title,
                           description=course)
            model.append(doc)
        print "Finish processing", dept, "\n"
    with open("pickle/simCatalog" + term + year + ".pickle", "w") as f:
        dump(model, f, 0)
    return model
def get_model_from_documents(path='./*/*.txt'):
    '''Return the documents and a TF-IDF model built from the given txt files.'''
    import codecs
    import glob
    from pattern.vector import Document, Model, TFIDF
    documents = []
    files = glob.glob(path)
    for file in files:
        f = codecs.open(file, 'r')
        data = f.read()
        f.close()
        document = Document(data)
        documents.append(document)
    model = Model(documents=documents, weight=TFIDF)
    return documents, model
def GetVectors():
    essay_path = 'training'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    percepticon = PerceptronTagger()
    cat_dict = defaultdict(int)
    for f in files:
        extended_text = ExtendText(f, percepticon)
        name = ''
        cats = ['high', 'medium', 'low']
        for cat in cats:
            if cat in f:
                name = cat + str(cat_dict[cat])
                cat_dict[cat] += 1
        docs.append(Document(extended_text, name=name, top=None))
    m = Model(docs)
    #lsa = m.reduce(5)
    return m
def r2iterator_to_model(collection, query):
    r2_list = []
    for r2 in collection.get_by_example(query):
        try:
            strings = [r2['program_desc']]
            for projid in r2['projects'].keys():
                try:
                    strings.append(r2['projects'][projid]['mission_desc'])
                except KeyError:
                    pass
            try:
                doc = Document(" ".join(strings), name=r2['_id'])
                r2_list.append(doc)
            except TypeError as e:
                print repr(e)
                print r2['_id']
        except KeyError as e:
            print repr(e)
    return Model(r2_list)
def rankingVectorSpaceModel(self, listaUrls, consulta):
    """Rank a list of URLs with the Vector Space Model (VSM).

    Input: search query as a string and a list of URLs.
    Output: final ranked list.
    """
    listaUrlsRankeados = []
    listaModel = []
    for url in listaUrls:
        documento = self.mongodb.getDocumento(url)
        if documento:
            documentoPattern = self.preprocesamiento.getDocumentoPattern(documento['_id'])
            listaModel.append(documentoPattern)
    unModelo = Model(listaModel, weight=TFIDF)
    for unDocumento in unModelo:
        score = self.svm.calcularVectorSpaceModel(consulta, unDocumento)
        listaUrlsRankeados.append(self.crearJsonRanking(unDocumento.name, score))
    listaFinal = sorted(listaUrlsRankeados, key=lambda k: k['score'], reverse=False)
    return listaFinal
from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Model, KNN

# Classification is a supervised machine learning method,
# where labeled documents are used as training material
# to learn how to label unlabeled documents.

# This example trains a simple classifier with Twitter messages.
# The idea is that, if you have a number of texts with a "type"
# (mail/spam, positive/negative, language, author's age, ...),
# you can predict the type of other "unknown" texts.
# The k-Nearest Neighbor algorithm classifies texts according
# to the k documents that are most similar (cosine similarity) to the given input document.

m = Model()
t = Twitter()

# First, we mine a model of a 1000 tweets.
# We'll use hashtags as type.
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains the #win hashtag, we'll set its type to 'WIN':
        s = tweet.text.lower()                 # tweet in lowercase
        p = '#win' in s and 'WIN' or 'FAIL'    # document labels
        s = Sentence(parse(s))                 # parse tree with part-of-speech tags
        s = search('JJ', s)                    # adjectives in the tweet
        s = [match[0].string for match in s]   # adjectives as a list of strings
        s = " ".join(s)                        # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))
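# --- Follow-up sketch (not from the original source) -----------------------
# The labeled model built above can now train a k-Nearest Neighbor classifier;
# the test sentence below is made up.
classifier = KNN(k=10)
for document in m:
    classifier.train(document)      # each document already carries type='WIN' or 'FAIL'

print(classifier.classes)           # e.g. ['FAIL', 'WIN']
print(classifier.classify(Document('awesome happy great')))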
def main(): ############################################################################################## print('QUESTION 1, Part I: Web Crawling: Extraction of Book Titles') print("-" * 70) print('\n') print( 'Retrieving Book Titles from the first two pages of Amazon search results! \n' ) print('Please wait a minute... \n') print("~" * 70) #open the base URL webpage level_1_url = "https://www.amazon.com/s?url=search-alias%3Daps&field-keywords=Martin+Heidegger" all_titles = get_titles(level_1_url) #print with text wrapping format = '%s' pieces = [format % (ttl) for ttl in all_titles] output = ' | '.join(pieces) ttls = fill(output) print('The scraped book titles are:') print("_" * 40) print('\n') print('\n\n'.join(ttls.split('|'))) print('\n') ############################################################################################## print( 'QUESTION 1, Part II: Pairwise Text Cosine Similarity Scores of Book Titles' ) print("-" * 70) print('\n') doc_list = [] for i in range(len(all_titles)): doc_list.append( Document(all_titles[i], type=" ".join(all_titles[i].split()))) m = Model(documents=doc_list, weight=TFIDF) cos_similarities = [(m.similarity(x, y), m.documents[i].type, m.documents[j].type) for i, x in enumerate(m.documents) for j, y in enumerate(m.documents) if i != j] unique_cos_sim = [ tuple(x) for x in set(map(frozenset, cos_similarities)) if len(tuple(x)) == 3 ] resorted_cos_sim_ttl = [] for i in range(len(unique_cos_sim)): resorted_cos_sim_ttl.append( sorted(tuple(str(e) for e in unique_cos_sim[i]))) resorted_cos_sim_ttl[i][0] = float(resorted_cos_sim_ttl[i][0]) resorted_cos_sim_ttl[i] = tuple(resorted_cos_sim_ttl[i]) print( 'The number of calculated book title cosine similarity scores is: {} \n' .format(len(resorted_cos_sim_ttl))) print( 'All non-zero book title cosine similarity scores, from smallest to largest: \n' ) for tup in sorted(resorted_cos_sim_ttl): if tup[0] != 0: print(tup[0]) print('\n') print("~" * 70) #print with text wrapping format = '%s' pieces = [ format % (sim, ) for sim in sorted( resorted_cos_sim_ttl, key=lambda t: t[0], reverse=True)[:5] ] output = ' | '.join(pieces) sims = fill(output) print( 'The cosine similarity scores of the five most similar book titles are: \n' ) print('\n\n'.join(sims.split('|'))) print('\n') print("~" * 70) pieces = [ format % (sim, ) for sim in sorted( resorted_cos_sim_ttl, key=lambda t: t[0], reverse=False)[:5] ] output = ' | '.join(pieces) sims = fill(output) print( 'The cosine similarity scores of the five most dissimilar book titles are: \n' ) print('\n\n'.join(sims.split('|'))) print('\n') ############################################################################################# print( 'QUESTION 1, Part III: Most Similar and Dissimilar Book Titles and Search Rankings' ) print("-" * 70) print('\n') print('The most similar pair of book titles is: \n') print(max(resorted_cos_sim_ttl)) print('\n') print('The most dissimilar pair of book titles is: \n') print(min(resorted_cos_sim_ttl)) print('\n') print("~" * 70) doc_types = [doc.type for doc in m.documents] print( 'The search ranking of the first element of the most similar book title pair is: \n' ) print(doc_types.index(max(resorted_cos_sim_ttl)[1])) print('\n') print( 'The search ranking of the second element of the most similar book title pair is: \n' ) print(doc_types.index(max(resorted_cos_sim_ttl)[2])) print('\n') print( 'The search ranking of the first element of the most dissimilar book title pair is: \n' ) print(doc_types.index(min(resorted_cos_sim_ttl)[1])) 
print('\n') print( 'The search ranking of the second element of the most dissimilar book title pair is: \n' ) print(doc_types.index(min(resorted_cos_sim_ttl)[2])) print('\n') ############################################################################################# print('QUESTION 2, Part I: Web Crawling: Extraction of Search Capsules') print("-" * 70) print('\n') orig_query = 'Ponderings XII–XV: Black Notebooks 1939–1941 (Studies in Continental Thought)' level_1_url = "https://www.google.com/search?q=" + orig_query.replace( ' ', '+') all_capsules = get_capsules(level_1_url) all_capsules_clean = [] for cp in all_capsules: all_capsules_clean.append( unicodedata.normalize('NFKD', cp).encode('ascii', 'ignore').decode('utf-8')) #print with text wrapping format = '%s' pieces = [format % (cap) for cap in all_capsules_clean] output = ' | '.join(pieces) caps = fill(output) print('The scraped capsules are:') print("_" * 40) print('\n') print('\n\n'.join(caps.split('|'))) print('\n') ############################################################################################## print( 'QUESTION 2, Part II: Pairwise Text Cosine Similarity Scores of Search Capsules' ) print("-" * 70) print('\n') query_list = [] for i in range(len(all_capsules_clean)): query_list.append( Document(all_capsules_clean[i], type=" ".join(all_capsules_clean[i].split()))) m = Model(documents=query_list, weight=TFIDF) cos_similarities = [(m.similarity(x, y), m.documents[i].type, m.documents[j].type) for i, x in enumerate(m.documents) for j, y in enumerate(m.documents) if i != j] unique_cos_sim = [ tuple(x) for x in set(map(frozenset, cos_similarities)) if len(tuple(x)) == 3 ] resorted_cos_sim_caps = [] for i in range(len(unique_cos_sim)): resorted_cos_sim_caps.append( sorted(tuple(str(e) for e in unique_cos_sim[i]))) resorted_cos_sim_caps[i][0] = float(resorted_cos_sim_caps[i][0]) resorted_cos_sim_caps[i] = tuple(resorted_cos_sim_caps[i]) print( 'The number of calculated capsule cosine similarity scores is: {} \n'. format(len(resorted_cos_sim_caps))) print( 'All non-zero capsule cosine similarity scores, from smallest to largest: \n' ) for tup in sorted(resorted_cos_sim_caps): if tup[0] != 0: print(tup[0]) print('\n') print("~" * 70) #print with text wrapping format = '%s' pieces = [ format % (sim, ) for sim in sorted( resorted_cos_sim_caps, key=lambda t: t[0], reverse=True)[:5] ] output = ' | '.join(pieces) sims = fill(output) print( 'The Cosine Similarity scores of the five most similar capsule pairs are: \n' ) print('\n\n'.join(sims.split('|'))) print('\n') print("~" * 70) pieces = [ format % (sim, ) for sim in sorted( resorted_cos_sim_caps, key=lambda t: t[0], reverse=False)[:5] ] output = ' | '.join(pieces) sims = fill(output) print( 'The Cosine Similarity scores of the five most dissimilar capsule pairs are: \n' ) print('\n\n'.join(sims.split('|'))) print('\n') print("~" * 70) print( 'Finding the capsule with the highest cosine similarity to the original query... 
\n' ) all_capsules_clean.append(orig_query) caps_and_query = [] for i in range(len(all_capsules_clean)): caps_and_query.append( Document(all_capsules_clean[i], type=" ".join(all_capsules_clean[i].split()))) m = Model(documents=caps_and_query, weight=TFIDF) cos_similarities = [(m.similarity(x, y), m.documents[i].type, m.documents[j].type) for i, x in enumerate(m.documents) for j, y in enumerate(m.documents) if i != j] unique_cos_sim_query = [ tuple(x) for x in set(map(frozenset, cos_similarities)) if len(tuple(x)) == 3 ] resorted_cos_sim_query = [] for i in range(len(unique_cos_sim_query)): resorted_cos_sim_query.append( sorted(tuple(str(e) for e in unique_cos_sim_query[i]))) resorted_cos_sim_query[i][0] = float(resorted_cos_sim_query[i][0]) resorted_cos_sim_query[i] = tuple(resorted_cos_sim_query[i]) result_list = [] for tup in resorted_cos_sim_query: if orig_query in tup: result_list.append(tup) result_tup = max(result_list, key=lambda x: x[0]) print( 'The cosine similarity score of the capsule most similar to the original query is: \n' ) print(result_tup) print('\n') print( 'Finding search ranking of the capsule with the highest cosine similarity to the original query... \n' ) match_list = [] for item in all_capsules_clean: match_list.append(item.replace('\n', '')) print( 'The search ranking of the capsule most similar to the original query is: \n' ) print(match_list.index(result_tup[1])) print('\n') ############################################################################################# print( 'QUESTION 2, Part III: Most Similar and Dissimilar Capsules and Search Rankings' ) print("-" * 70) print('\n') print('The most similar pair of capsules is: \n') print(max(resorted_cos_sim_caps)) print('\n') print('The most dissimilar pair of capsules is: \n') print(min(resorted_cos_sim_caps)) print('\n') print("~" * 70) doc_types = [doc.type for doc in m.documents] print( 'The search ranking of the first element of the most similar capsule pair is: \n' ) print(doc_types.index(max(resorted_cos_sim_caps)[1])) print('\n') print( 'The search ranking of the second element of the most similar capsule pair is: \n' ) print(doc_types.index(max(resorted_cos_sim_caps)[2])) print('\n') print( 'The search ranking of the first element of the most dissimilar capsule pair is: \n' ) print(doc_types.index(min(resorted_cos_sim_caps)[1])) print('\n') print( 'The search ranking of the second element of the most dissimilar capsule pair is: \n' ) print(doc_types.index(min(resorted_cos_sim_caps)[2])) print('\n') ############################################################################################ print('Summary Report: Document Similarity Semantic Analysis') print("-" * 70) ################ report = "A crawler with changing user-agent headers was used to scrape book titles on Amazon from the first two pages of results returned when searching the philosopher, Martin Heidegger. Using TF-IDF values derived from a model incorporating the scraped results, all pairwise cosine similarity scores were calculated for the corpus documents, each of which consisted of the book title and any accompanying subtitle text. The scores were filtered for unique book title pairs and sorted by ascending cosine similarity score, so the top 5 and bottom 5 pairs could be printed in terminal. As several pairings returned a cosine similarity score of 0, the most dissimilar pair among the lowest scores could not be decisively quantified. 
Interestingly, search rankings of the elements of the most similar and dissimilar pairs did not appear on the same page of results. Another crawler was used to scrape capsules returned by a Google search for one of the book titles appearing in the Amazon results. Capsules from the first three pages of Google results were Unicode normalized and decoded before they were incorporated into another model, from which TF-IDF values were derived. All pairwise cosine similarity scores were calculated for the new set of corpus documents, which consisted of all text appearing in each capsule. Scores were filtered for unique capsule pairs and sorted by ascending cosine similarity score; the top 5 and bottom 5 pairs were again printed in terminal. To identify the capsule most similar to the original query, the latter was then included in the model, from which a new set of TF-IDF values and cosine similarity scores were generated. Interestingly, the ranking of the most similar capsule appeared lower in the search results than expected, on the bottom of the second page. Intuitively, the search rankings of the capsules most similar to one another did, however, appear on the same page of Google results." ############## format = '%s' pieces = [format % (word) for word in report] output = ''.join(pieces) write_up = fill(output) print(write_up) return None
# and filters out noise, so that semantically related words come out stronger.

# We'll use the Pang & Lee corpus of movie reviews, included in the testing suite.
# Take 250 positive reviews and 250 negative reviews:
data = os.path.join("..", "..", "test", "corpora", "polarity-en-pang&lee.csv")
data = Datasheet.load(data)
data = data[:250] + data[-250:]

# Build a model of movie reviews.
# Each document consists of the top 40 words in the movie review.
documents = []
for score, review in data:
    document = Document(review, stopwords=False, top=40, type=int(score) > 0)
    documents.append(document)

m = Model(documents)

print "number of documents:", len(m)
print "number of features:", len(m.vector)
print "number of features (average):", sum(len(d.features) for d in m.documents) / float(len(m))
print

# 6,337 different features may be too slow for some algorithms (e.g., hierarchical clustering).
# We'll reduce the document vectors to 10 concepts.

# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
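# --- Follow-up sketch (not from the original source) -----------------------
# The two steps announced in the comments above, shown independently:
# reducing the vectors to 10 latent concepts, and cross-validating a
# Naive Bayes classifier with pattern.vector's kfoldcv() helper.
from pattern.vector import NB, kfoldcv

m.reduce(10)    # LSA: 10 concepts instead of ~6,337 features

# (accuracy, precision, recall, F1, stdev) over 10 folds of the labeled reviews:
print kfoldcv(NB, documents=documents, folds=10)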
def load_model(filename):
    '''
    Given a path/filename, load the Pattern.Vector model from that filename.
    '''
    print "Loading model from file {}".format(filename)
    return Model.load(filename)
news, url = {}, 'http://news.google.com/news?output=rss'
for story in Newsfeed().search(url, cached=False):
    d = str(date(story.date, format='%Y-%m-%d'))
    s = plaintext(story.description)
    # Each key in the news dictionary is a date: news is grouped per day.
    # Each value is a dictionary of id => story items.
    # We use hash(story.description) as a unique id to avoid duplicate content.
    news.setdefault(d, {})[hash(s)] = s

# Your code will probably have some preprocessing steps to save and load the mined news updates.

m = Model()
for date, stories in news.items():
    s = stories.values()
    s = ' '.join(s).lower()
    # Each day of news is a single document.
    # By adding all documents to a model we can calculate tf-idf.
    m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=date))

for document in m:
    print document.name
    print document.keywords(top=10)
def get_results(query, quantity, force=False, news=False, analysis=True): query = query.lower() start = datetime.now() query = query.replace('_', '%20') breakdown = 50 if breakdown > quantity: breakdown = quantity data_to_be_written = [] knowledgeKeywords = [] duplicates = [] results, created = webSearch.objects.get_or_create(queryText=query.strip()) if created or force or len(results.results.all()) < quantity: all_results = getGoogleResults(query, quantity, news, force) else: all_results = [] if len(all_results) == 0 and not created: all_results = [r.url for r in results.results.all()] all_results = all_results[:quantity] print "TOTAL RESULTS ", str(len(all_results)) # Done with getting search results for index, i in enumerate(all_results): try: wr, created = WebResource.objects.get_or_create(url=i) if created: wr = parseURL(i, True) data = {'url': i} keywords = [ w for w in count(wr.text, top=10, stemmer=LEMMA) if w not in stop ] if 'books.google' in i: text = '' else: text = wr.text data.update({ 'keywords': keywords, 'text': plaintext(text), 'title': wr.title, 'urls': wr.urls, 'type': 'result', 'index': index + 1, 'similar': [], 'duplicates': [], 'category': 0, }) if wr not in results.results.all(): results.results.add(wr) data['plaintext'] = data['text'].split('\n') # while '' in data['plaintext']: # data['plaintext'].remove('') # knowledgeKeywords.extend(data['keywords']) data_to_be_written.append(data) except Exception as e: print e print "Response Result model Prepared" if not analysis: return data_to_be_written list_of_sim_docs, model, m = find_similarity(data_to_be_written) for i in list_of_sim_docs: similar = { 'type': 'similar', 's': i.get('source'), 'd': i.get('dest'), 'source': i.get('source'), 'dest': i.get('dest'), 'score': i.get('score'), } data_to_be_written.append(similar) if similar['score'] > 0.9: for res in data_to_be_written: if res['type'] in [ 'result', 'duplicate' ] and res['url'] == i.get('dest') and len(res['text']) > 0: print "Duplicate [{0}].[{1}]".format( i['source'][:20], i['dest'][:20]) res['type'] = 'duplicate' items = [ Document(i.get('text'), name=i.get('url'), description=i.get('index'), stemmer=LEMMA) for i in data_to_be_written ] m = Model(items, weight=TFIDF) # k = 10 ####### BEGIN Experimental Setup ########## # v,d = m.features, m.documents # y,x = len(m.documents),len(m.features) def build_matrix(w=None, d=None): y, x = len(d), len(w) model = np.zeros((y, x)) for i in range(y): model[i] = [1 if w[j] in d[i].words else 0 for j in range(x)] return model # def find_word_matches(model, words = None, d = None): # y,x = model.shape # for i in range(y): # for j in range(i+1,y): # a = np.copy(model[i]) # b = np.copy(model[j]) # a_ones = np.count_nonzero(a) # b_ones = np.count_nonzero(b) # comparison = (a==b) # cross_product = a*b # intersection = np.count_nonzero(cross_product) # union = a_ones+b_ones-intersection # if a_ones+b_ones>0 and intersection > 0: # score = intersection/union # else: # score = 0 # if model[i].any() and model[j].any() and comparison.any() and score > 0.4: # print "Match [{0}] {1}:[{2} words] - [{3}] {4}:[{5} words] : {6} words".format(d[i].description,d[i].name[:30], np.count_nonzero(a), d[j].description,d[j].name[:30], np.count_nonzero(b), score, math.fabs(d[i].description - d[j].description)) # similar = { # 'type' : 'similar', # 'source' : d[i].name, # 'dest' : d[j].name, # 'score' : score, # } # data_to_be_written.append(similar) # if score >= 0.9: # for res in data_to_be_written: # if res['type'] in ['result','duplicate'] and 
res['url'] == d[j].name and len(res['text'])>0: # print "Duplicate [{0}].[{1}]".format(i+1,j+1) # res['type'] = 'duplicate' # return model def word_frequency(model, words=None, documents=None, threshold1=0, threshold2=1, transpose=False): "Returns frequent word amoung documents in range of threshold" y, x = model.shape data = {} for i in range(x): count = np.count_nonzero(model[:, i]) / y if count >= threshold1 and count <= threshold2: if words: data[words[i]] = count else: data[i] = count return data model = build_matrix(m.features, m.documents) # model = find_word_matches(model, m.features, m.documents) knowledgeKeywords = [ w for w in word_frequency(model, m.features, m.documents, 0.2, 0.8) ][:20] ####### END Experimental Setup ########## # c = m.cluster(method=HIERARCHICAL, k=k) # for i in c: # cluster = [] # k = [] # contains_text = False # for item in i: # for data in data_to_be_written: # if data.get('type') == 'result' and data.get('url')==item.name: # cluster.append({ # 'url' : data.get('url'), # 'index' : item.description, # }) # if data.get('text'): # k.extend([w for w in count(words(data.get('text')), top=50, stemmer = PORTER, exclude=[], stopwords=False, language='en')]) # contains_text=True # cluster = { # 'type' : 'cluster', # 'data' : cluster, # 'index' : min([c.get('index') for c in cluster] + [0]), # 'keywords' : [w for w in count(k, top=10, stemmer = PORTER, exclude=[], stopwords=False, language='en')] # } # cluster['contains_text'] = contains_text # data_to_be_written.append(cluster) # print "{0} results".format(len(data_to_be_written)) data_to_be_written.append({ 'type': 'meta', 'keywords': knowledgeKeywords, }) result = {} for i in data_to_be_written: if i.get('type') in ['result', 'duplicate']: url = i.get('url') index = int(i.get('index')) result[index] = [ 1 for r in data_to_be_written if r.get('type') == 'similar' and r['source'] == url ] result2 = [i for i, j in result.iteritems()] result3 = [len(j) for i, j in result.iteritems()] Process(target=plot_graph, args=(result2, result3)).start() return data_to_be_written
def crearModelo(self, listaDocumentos):
    '''Create a model from a list of documents using TF-IDF frequency weighting.'''
    return Model(listaDocumentos, weight=TFIDF)
import cPickle as pickle

con = pymongo.MongoClient()
sentiment_res = con.tweets.sentiment_analysis
sentiment_res_p = con.tweets.patterns_sentiment_analysis
tweets = con.tweets.tweets_toronto
docs = []

# with open('D:\\data\\documents.spkl', 'wb') as fp:
#     for tweet in tweets.find():
#         doc = Document(tweet['text'], name=tweet['id'])
#         pickle.dump(doc, fp)
#     fp.close()

m = Model(documents=[], weight=TFIDF)

with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in range(tweets.count() / 100):
        print 'Loading model'
        m.append(pickle.load(fp))
        print len(m.documents)

with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in xrange(tweets.count()):
        print 'Loading model'
        m.append(pickle.load(fp))
        print len(m.documents)

print len(m.documents)
m.reduce(dimensions=L2)
m.save  # NOTE: Model.save() expects a file path, e.g. m.save(<path>).
r2_list = []
for query in r2_queries:
    for r2 in r2_exhibits.get_by_example(query):
        try:
            strings = [r2['program_desc']]
            projects = [r2['projects'][k] for k in r2['projects'].keys()]
            for proj in projects:
                try:
                    strings.append(proj['mission_desc'])
                except KeyError as e:
                    pass
            doc = Document(" ".join(strings), name=r2['_id'])
            r2_list.append(doc)
        except KeyError as e:
            print repr(e)  # not much to do about this

m = Model(r2_list)


def r2iterator_to_model(collection, query):
    r2_list = []
    for r2 in collection.get_by_example(query):
        try:
            strings = [r2['program_desc']]
            for projid in r2['projects'].keys():
                try:
                    strings.append(r2['projects'][projid]['mission_desc'])
                except KeyError:
                    pass
            try:
                doc = Document(" ".join(strings), name=r2['_id'])
                r2_list.append(doc)
            except TypeError as e:
class ClusterLSI(object): def __init__(self): """Setting up ClusterLSI environment """ self.field = "scopeAndContent" self.limit = False self.identifier = "idDoc" self.model = False self.cluster = False self.depth = 0 self.outputNodes = "./clus-nodes.csv" self.outputEdges = "./clus-edges.csv" def normalize(self, s): """Normalize a string Keyword arguments: s --- string """ if type(s) == unicode: return s.encode('utf8', 'ignore') else: return str(s) def modeling(self, descriptions, field = False, limit = False): """Model returns a pattern.vector.Model object which is a list of pattern.vector.Document using Ehri.Get() descriptions Keyword arguments: descriptions --- EHRI.get() description object field --- Field to look into, override defaut self.field limit --- Debug option. Limit the model to $limit items """ if field: self.field = field if limit: self.limit = limit D = [] #Creating Pattern Document element from data we got from Neo4J # #For debug reasons, we could set a limit if self.limit: i = 0 for description in descriptions: D.append(Document(description[self.field], name=description[self.identifier])) #And stop the iteration when i reaches the limit if self.limit: i += 1 if i == self.limit: break #Then, creating a model from our array self.model = Model(D) return self.model def clusterize(self, model = False): """Returns a cluster of given model Keyword arguments: model --- If set, override instance model """ if model: self.model = model self.cluster = self.model.cluster(method=HIERARCHICAL, k=2) return self.cluster def flatten(self, array, typeOf = "str"): """Returns a 1 dimension list with given type of item inside given array Keyword arguments: array --- A list of items typeOf --- Type of item the function should return """ #Flatten an array if typeOf == "str": return [element for element in array if isinstance(element, basestring)] elif typeOf == "list": return [element for element in array if isinstance(element, list)] def csv(self, array, parents = False, fake = 0): """Return a tuple of csv string with given items and number of fake items Keyword arguments: array --- A list of items parents --- A list of parents fake --- An index for fake parents """ string = "" #Making list of elements, avoid calling it once more currents = self.flatten(array, "str") children = self.flatten(array, "list") if len(currents) == 0: fake += 1 Ffake = fake #If we have parents, we have parents connections if parents: for element in currents: for parent in parents: string += self.normalize(element) + ";" + parent + "\n" #Taking care of children for child in children: if len(currents) > 0: Sstring, Ffake = self.csv(child, currents, Ffake) else: Sstring, Ffake = self.csv(child, ["fake-"+str(fake)], Ffake) string += Sstring return string, Ffake def clusterToArray(self, Graph): """Convert a cluster object to an array list with n-depth where depth is same as cluster.depth Keyword arguments: Graph --- Cluster or list """ array = [] Docs = [element for element in Graph if isinstance(element, pattern.vector.Document)] Clusts = [element for element in Graph if isinstance(element, list)] for node in Docs: array.append(node.name) for node in Clusts: array.append(self.clusterToArray(node)) return array def save(self, descriptions, csv, fakes = 0, nodesName = False, edgesName = False ): """Output cluster into csv files Keyword arguments: descriptions --- EHRI.get() description item fakes --- Number of fakes parents nodesName --- Filename for Nodes's CSV file edgesName --- Filename for Edges's CSV file """ if 
nodesName: self.outputNodes = nodesName if edgesName: self.outputEdges = edgesName f = open(self.outputNodes, "wt") f.write("id;label;type\n") for description in descriptions: f.write(self.normalize(description[self.identifier] + ";" + description[self.identifier] + ";1\n")) i=0 while i <= fakes: f.write("fake-" + str(i) + ";" + "fake" + str(i) + ";0\n") i+= 1 f.close() f = open(self.outputEdges, "wt") f.write("source;target\n"); f.write(csv) f.close()
refineddata1 = [(features(c[0]), c[1]) for c in refineddata]

# Each datapoint becomes a pattern Document here; the type represents the label for each document.
refineddata2 = [Document(message, type=sideeffectindicator)
                for message, sideeffectindicator in refineddata1]

# Define the model using the documents; the feature weight is Information Gain.
# You can try changing it to TF, TFIDF, etc.
model = Model(documents=refineddata2, weight=IG)

# Select the top 500 features.
features = model.feature_selection(top=500)

# If medicine names are present, they are removed.
refinedfeatures = []
for i in features:
    if i not in medlist:
        refinedfeatures.append(i)
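# --- Follow-up sketch (not from the original source) -----------------------
# The model is typically restricted to the surviving features before training
# a classifier; the same Model.filter() call appears in the SVM snippet later
# in this section. Naive Bayes is used here only as an illustration.
from pattern.vector import NB

filtered_model = model.filter(features=refinedfeatures)
classifier = NB(train=filtered_model.documents)
print classifier.classes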
# -*- coding: utf-8 -*-
from json import load
from pattern.vector import Document, Model, L2

packages = load(file("packages.json"))
docs = [Document(p['description'], name=p['name']) for p in packages]
model = Model(docs)
lsa = model.reduce(L2)
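# --- Follow-up sketch (not from the original source) -----------------------
# Inspect the strongest package-description terms in each latent concept
# produced by the reduction above.
for i, concept in enumerate(model.lsa.concepts):
    top = sorted(concept.items(), key=lambda kv: abs(kv[1]), reverse=True)[:5]
    print("%s: %s" % (i, [feature for feature, weight in top]))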
from pattern.vector import Document, Model

d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')

m = Model([d1, d2, d3, d4])
m.reduce(2)

for d in m.documents:
    print
    print d.name
    for concept, w1 in m.lsa.vectors[d.id].items():
        for feature, w2 in m.lsa.concepts[concept].items():
            if w1 != 0 and w2 != 0:
                print (feature, w1 * w2)
# the weights will be between 0.0-1.0 (their sum is 1.0).
print document.copy()

# document vector
v1 = Vector({"curiosity": 1, "kill": 1, "cat": 1})
v2 = Vector({"curiosity": 1, "explore": 1, "mars": 1})
print 1 - distance(v1, v2)

# model
d1 = Document('A tiger is a big yellow cat with stripes.', type='tiger')
d2 = Document('A lion is a big yellow cat with manes.', type='lion')
d3 = Document('An elephant is a big grey animal with a slurf.', type='elephant')
print d1.vector

m = Model(documents=[d1, d2, d3], weight=TFIDF)
print d1.vector
print m.similarity(d1, d2)  # tiger vs. lion
print m.similarity(d1, d3)  # tiger vs. elephant

# lsa concept space
d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')

m = Model([d1, d2, d3, d4])
m.reduce(2)

for d in m.documents:
    print
    print d.name
    for concept, w1 in m.lsa.vectors[d.id].items():
        for feature, w2 in m.lsa.concepts[concept].items():
def loadTFIDF():
    """Load the pickle file created by runTFIDF()."""
    return Model.load("project/pickle/course.pic")
def initializeModel():
    classifierModel = Model.load('classificationModel.slp')
    return classifierModel
# to represent this.

# A Model is a collection of document vectors.
# A Model is a matrix (or vector space)
# with features as columns and feature weights as rows.
# We can then do calculations on the matrix,
# for example to compute TF-IDF or similarity between documents.

# Load a model from a folder of text documents:
documents = []
for f in glob.glob(os.path.join(os.path.dirname(__file__), "corpus", "*.txt")):
    text = codecs.open(f, encoding="utf-8").read()
    name = os.path.basename(f)[:-4]
    documents.append(Document(text, name=name))

m = Model(documents, weight=TFIDF)

# We can retrieve documents by name:
d = m.document(name="lion")

print(d.keywords(top=10))
print()

print(d.tf("food"))
# TF-IDF is less: "food" is also mentioned with the other animals.
print(d.tfidf("food"))
print()

# We can compare how similar two documents are.
# This is done by calculating the distance between the document vectors
# (i.e., finding those that are near to each other).
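# --- Follow-up sketch (not from the original source) -----------------------
# The comparison mentioned above, done directly on the model.

# Rank the documents most similar to the "lion" document retrieved above:
for similarity, neighbor in m.neighbors(d, top=3):
    print(round(similarity, 2), neighbor.name)

# Or compare two specific documents (assuming a "tiger.txt" file also exists
# in the corpus folder, alongside "lion.txt"):
d2 = m.document(name="tiger")
if d2 is not None:
    print(m.similarity(d, d2))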
    # per comment to the document
    all_entry_comment_text_filtered += len(entry_comments) * " xxludumscrapecommentcounterxx "
    #print(all_entry_comment_text_filtered)

    # A 'document' is a bag of words from all comments for one game entry
    # (it seems to work better grouping all comments), associated with its
    # rating or classification (e.g. type=output_vector).
    documents.append(Document(all_entry_comment_text_filtered,
                              name="%s\t%s" % (author, url),
                              type=output_vector,
                              stopwords=True))

vectors = []
if use_feature_selection:
    vectors = Model(documents=documents, weight=pattern.vector.TFIDF)
    vectors = vectors.filter(features=vectors.feature_selection(top=select_top_n_features))
    #print(vectors.vectors)
else:
    vectors = documents

if options["train"]:
    if classifier_type == "SVM":
        classifier = SVM(train=vectors, type=svm_type, kernel=svm_kernel)
    else:
        classifier = getattr(pattern.vector, classifier_type)(train=vectors)
    print("Classes: " + repr(classifier.classes))
# We'll use the Pang & Lee corpus of movie reviews, included in the testing suite.
# Take 250 positive reviews and 250 negative reviews:
data = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora", "polarity-en-pang&lee1.csv")
data = Datasheet.load(data)
data = data[:250] + data[-250:]

# Build a model of movie reviews.
# Each document consists of the top 40 words in the movie review.
documents = []
for score, review in data:
    document = Document(review, stopwords=False, top=40, type=int(score) > 0)
    documents.append(document)

m = Model(documents)

print("number of documents:", len(m))
print("number of features:", len(m.vector))
print("number of features (average):", sum(len(d.features) for d in m.documents) / float(len(m)))
print()

# 6,337 different features may be too slow for some algorithms (e.g., hierarchical clustering).
# We'll reduce the document vectors to 10 concepts.

# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
def recommend_game(this_game):
    games = recommendable_games(this_game)
    total_recommendable = games.count()
    print 'Total recommendable games based on ' + this_game.title + ": " + total_recommendable.__str__()
    document_title = Document(this_game.title)
    document_publisher = Document(this_game.publisher)
    document_summary = Document(this_game.summary, top=None, threshold=0, stemmer=None,
                                exclude=[], stopwords=False, language='en')
    document_keywords = Document(', '.join([x['name'] for x in this_game.keywords.all().values("name")]))
    document_genres = Document(', '.join([x['name'] for x in this_game.genres.all().values("name")]))
    # format: {"id": id, "score": SUM(dist * weight)}
    game_similarities = []
    summary_documents = []
    for game in games:
        score = 0
        game = Game.objects.filter(title=game['title'], platform=game['platform'])[0]
        title_similarity = 1 - distance(document_title.vector, Document(game.title).vector)
        publisher_similarity = 1 - distance(document_publisher.vector, Document(game.publisher).vector)
        genre_similarity = 1 - distance(document_genres.vector, Document(
            ', '.join([x['name'] for x in game.genres.all().values("name")])).vector)
        keywords_similarity = 1 - distance(document_keywords.vector, Document(
            ', '.join([x['name'] for x in game.keywords.all().values("name")])).vector)
        score = (0.15 * title_similarity) + (0.2 * genre_similarity) + \
                (0.2 * publisher_similarity) + (0.20 * keywords_similarity)
        summary_documents.append(Document(game.summary, top=None, threshold=0, stemmer=None,
                                          exclude=[], stopwords=False, language='en', name=game.id))
        game_similarities.append({"id": game.id, "score": score})
    to_compare = Document(document_summary)
    model = Model(documents=summary_documents, weight=TFIDF)
    neighbours = model.neighbors(to_compare, top=total_recommendable)
    for neighbour in neighbours:
        for rec_game in game_similarities:
            if rec_game['id'] == neighbour[1].name:
                rec_game['score'] = rec_game['score'] + 0.25 * neighbour[0]
    recommended = sorted(game_similarities, key=lambda k: -k['score'])[0:total_recommendable]
    if len(recommended) >= 40:
        random_selection = random.sample(recommended[0:40], 25)
    else:
        random_selection = random.sample(recommended, 25)
    recommended_ids = [g['id'] for g in random_selection]
    return recommended_ids
# but it is still popular because it is fast for models
# that have many documents and many features.
# It is outperformed by KNN and SVM, but useful as a baseline for tests.

# We'll test it with a corpus of spam e-mail messages,
# included in the test suite, stored as a CSV-file.
# The corpus contains mostly technical e-mail from developer mailing lists.
data = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora", "spam-apache.csv")
data = Datasheet.load(data)

documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)

m = Model(documents)

print("number of documents:", len(m))
print("number of words:", len(m.vector))
print("number of words (average):", sum(len(d.features) for d in m.documents) / float(len(m)))
print()

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)
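# --- Follow-up sketch (not from the original source) -----------------------
# Once trained, the classifier can label unseen text; the message is made up.
# True = legitimate e-mail, False = spam.
message = Document("Win a free prize now, click this link!!!")
print(classifier.classify(message))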
from pattern.vector import Document, Model, IG, TF, TFIDF, BINARY
import sys
import os

print "Reading sample code and instantiating documents..."
documents = []
exampleDir = "examples/"
for file in os.listdir(exampleDir):
    if os.path.isdir(exampleDir + file):
        for subfile in os.listdir(exampleDir + file):
            if os.path.isfile(exampleDir + file + "/" + subfile):
                with open(exampleDir + file + "/" + subfile, "r") as langDoc:
                    text = langDoc.read()
                doc = Document(text, type=file)
                documents.append(doc)

print "Creating statistical model..."
m = Model(documents=documents, weight=IG)

# Test with a sample Java doc
print "Comparing test document..."
with open("coffee.txt", "r") as myfile:
    testFile = myfile.read()
testDoc = Document(testFile, type='Java')
testSimilarities = m.neighbors(testDoc, top=10)
prediction = testSimilarities[0][1].type  # neighbors() returns a (similarity, document) list
confidence = testSimilarities[0][0]
print "LanguageLearn has predicted " + prediction + " with a " + str(round(confidence * 100, 2)) + "% confidence"
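# --- Follow-up sketch (not from the original source) -----------------------
# Instead of reading only the single nearest neighbor, the same documents can
# train a k-NN classifier that votes over several neighbors.
from pattern.vector import KNN

knn = KNN(k=10)
for doc in documents:
    knn.train(doc)      # each doc carries type=<language folder name>
print "k-NN prediction: " + str(knn.classify(testDoc))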