def clean_steam():
    """Normalize the stemmed content of every document.

    Strips markup, accents, digits and redundant whitespace from
    ``steamed_content`` and persists the cleaned text back on each row.
    """
    docs = Document.objects.all()
    total = len(docs)
    goal, current = 0, 0
    for doc in docs:
        goal, current = avance(current, total, goal)
        raw = doc.steamed_content
        if not raw:
            # Nothing stemmed yet for this document; skip it.
            continue
        # Remove <>, [], <!-- --> and line breaks.
        cleaned = strip_tags(unicode(raw))
        # Trim edges, lowercase, drop accents; pad with sentinel spaces.
        cleaned = ' ' + remove_non_unicode(cleaned.strip().lower()) + ' '
        # Drop digits and non-alphanumeric characters.
        cleaned = remove_non_alphanumeric(cleaned)
        # Collapse runs of whitespace.
        cleaned = remove_spaces(cleaned)
        doc.steamed_content = cleaned
        doc.save()
def clean_content(request): """Limpia el contenido de un documento filtrando stop words y caracteres que no son letras""" documents = Document.objects.filter(Q(cleaned_content='') | Q(cleaned_content=None)| Q(steamed_content='') | Q(steamed_content=None)) goal = 0 current = 0 leng = len(documents) print " -> Removing Stop Words and weird chars..." sw = Stopword.objects.all() stopwords = '|'.join([" "+str(x)+" " for x in sw]) print " -> Cleaning Documents" for d in documents: goal, current = avance(current, leng, goal) if not d.cleaned_content: d.clean_content(stopwords) if d.cleaned_content: #d.stemmed_content = freeling_stemming(d.cleaned_content) d.save() else: d.delete() print " Documents cleaned!"
def remove_common_words(ldamodel): """ This method removes words which have similar frequency among diffrent datasets """ print "Checking and Removing common words" remove_list = [] n_of_datasets = DataSetLdaModel.objects.filter(ldamodel=ldamodel).count() datasets = DataSet.objects.filter(datasetldamodel__ldamodel = ldamodel) if n_of_datasets < 2: return lda_words = WordLdaModel.objects.filter(ldamodel=ldamodel) goal = 0 current = 0 leng = len(lda_words) for this_word in lda_words: goal, current = avance(current, leng, goal) freq_table = n_of_datasets*[0] #print freq_table wsf_involved = WordDataSetFrequency.objects.filter(word = this_word, dataset__in = datasets) #print wsf_involved for i in range(0,len(wsf_involved)): freq_table[i] = wsf_involved[i].frequency freq_tot = sum(freq_table) freq_avg = float(freq_tot)/n_of_datasets # Promedio deltas delta_avg = 0 for i in range(0,n_of_datasets-1): for j in range(i+1,n_of_datasets): delta_avg += abs(freq_table[i]-freq_table[j]) delta_avg = float(delta_avg)*2/((n_of_datasets-1)*n_of_datasets) # Remove if delta_avg < freq_avg: remove_list.append(str(this_word.id)) if remove_list: sql = "DELETE FROM application_wordldamodel WHERE id IN (%s)" % ",".join(remove_list) cursor = connection.cursor() cursor.execute(sql) cursor.execute("COMMIT") connection.close() print " -> %s Words removed" % len(remove_list) else: print " -> No words removed"
def load_words(request, ldamodel = None): cursor = connection.cursor() """Carga el diccionario a partir de las palabras previamente limpiadas""" print "Reading words we already have..." historic_words = [] for w in Word.objects.all(): historic_words.append(w.name) print "We have %s words..." % len(historic_words) dictionary = {} print "Loading words..." documents = Document.objects.filter(loaded_words = 0) i = 0 print " -> %s documents loaded..." % len(documents) for d in documents: for a in d.cleaned_content.split(" "): dictionary[a] = 1 goal = 0 current = 0 leng = len(dictionary) for d in dictionary: goal, current = avance(current, leng, goal) if d not in historic_words: w = Word(name = d) try: w.save() except Exception as e: #print e[0] if not int(e[0]) == 1062: print "Fallo "+w.name print " Words Loaded!" print " Updating documents..." cursor.execute("UPDATE application_document SET loaded_words = 1 WHERE loaded_words = 0;") cursor.execute("COMMIT") print " Documents Updated!" connection.close()
def get_freq(request, ldamodel = None): """calculos de la tabla frequency""" #LIMPIO LA TABLA cursor = connection.cursor() print "Getting Frequencies..." documents = Document.objects.filter(frec_calculated = 0) #Caluclo LAS FRECUENCIAS wf = {} words = {} for w in Word.objects.all(): words[w.name] = w print " Getting simple frequency calculated..." goal = 0 current = 0 leng = len(documents) i = 0 for d in documents: goal, current = avance(current, leng, goal) wf.clear() #print "Counting Words..." if ldamodel and ldamodel.stemming and d.steamed_content: if i == 0: print "using stemming" i += 1 content = d.steamed_content.split(' ') else: content = d.cleaned_content.split(' ') for s in content: wf[s] = wf.get(s,0) + 1 #print "Saving..." freqs = [] for kw in wf: try: freqs.append("("+str(d.id)+","+str(words[kw].id)+","+str(wf[kw])+")") except Exception as e: print "En documento %s error:" % d.id print e query = "INSERT INTO application_frequency (document_id, word_id, frequency) VALUES %s;" % ",".join(freqs) #print query cursor.execute(query) cursor.execute("COMMIT") print " Simple frequency calculated!" print " Getting max frequency..." cursor.execute("""UPDATE application_document D SET max_word_frec=(SELECT MAX(F.frequency) FROM application_frequency F WHERE F.document_id=D.id AND max_word_frec is NULL)""") print " Max frequency calculated!" print " Updating documents..." cursor.execute("UPDATE application_document SET frec_calculated = 1 WHERE frec_calculated = 0;") print " Documents Updated!" cursor.execute("COMMIT") connection.close()
def lda(documents_dist, topic_local_to_universal, alpha, beta):
    """Runs LDA over a set of documents, saving results over a set of
    predefined topics.

    documents_dist: iterable of objects carrying a ``distribution`` string
        (comma-separated hex word ids, trailing separator) and a
        ``.document.id`` — presumably DocumentDistribution rows; verify
        against caller.
    topic_local_to_universal: maps local topic index -> universal topic id;
        its length fixes the number of topics.
    alpha, beta: scalar Dirichlet hyperparameters, broadcast into the
        prior matrices passed to deltaLDA.

    Side effects: inserts rows into application_documenttopic, writes the
    topic/word weights to /tmp/application_topicword.txt and hands the file
    to load_data_in_file(). Returns True on completion.
    """
    cursor = connection.cursor()
    n_topics = len(topic_local_to_universal)
    # Local<->universal word-id maps, filled as a side effect of word_mapper.
    word_local_to_universal = {}
    word_universal_to_local = {}
    document_local_to_universal = {}
    print "Getting document matrix..."
    # Each distribution is parsed as hex ids ([:-1] drops the trailing
    # separator) and remapped to dense local indices by word_mapper.
    dic = [word_mapper(map(lambda x: int(str(x),16),document_dist.distribution[:-1].split(',')),word_local_to_universal,word_universal_to_local) for document_dist in documents_dist]
    # Local document index -> universal Document id, in the same order as dic.
    document_local_to_universal = dict(enumerate([document_dist.document.id for document_dist in documents_dist]))
    n_documents = str(len(dic))
    n_words = len(word_local_to_universal)
    print "Numero de documentos: "+str(n_documents)
    print "Numero de palabras: "+str(n_words)
    if int(n_documents) == 0:
        raise Exception('LDAmodel has no documents assigned or the documents had only irrelevant words. No document matrix founded.')
    # Sampler configuration: fixed seed keeps runs reproducible.
    f_label = 1
    numsamp = 50
    randseed = 194582
    alpha_vector = alpha * ones((f_label,n_topics))
    beta_vector = beta * ones((n_topics,n_words))
    print "Calculating LDA using..."
    print " beta: "+str(beta)
    print " alpha: "+str(alpha)
    print " ntopics: "+str(n_topics)
    # phi: topic-word weights; theta: document-topic weights.
    (phi,theta,sample) = deltaLDA(dic,alpha_vector,beta_vector,numsamp,randseed)
    print "Saving Results..."
    ########################
    # document_topic
    ########################
    print "Saving Document and topic correlation..."
    document_local_id = 0
    goal = 0
    current = 0
    theta_len = len(theta)
    # One multi-row INSERT (and COMMIT) per document.
    for d in theta:
        st = "INSERT INTO application_documenttopic (document_id, topic_id, value) VALUES "
        goal, current = avance(current, theta_len, goal)
        topic_local_id = 0
        for document_weight in d:
            # Translate local indices back to universal ids for storage.
            st = st + "("+str(document_local_to_universal[document_local_id])+","+str(topic_local_to_universal[topic_local_id])+","+str(document_weight)+"),"
            topic_local_id += 1
        # Swap the trailing comma for the statement terminator.
        st = st[:-1]+";"
        cursor.execute(st)
        cursor.execute("COMMIT")
        document_local_id += 1
    #####################
    # topic_word
    #####################
    print "Saving topics and word correlation to file"
    topic_local_id = 0
    goal = 0
    current = 0
    phi_len = len(phi)
    # NOTE(review): nbest is computed but never used below.
    nbest = int(n_words*0.5)
    # World-writable dump file consumed afterwards by load_data_in_file().
    os.system("touch /tmp/application_topicword.txt")
    os.system("chmod 777 /tmp/application_topicword.txt")
    FILE = '/tmp/application_topicword.txt'
    print 'Opening %s' % FILE
    fw = open (FILE,'w')
    for t in phi:
        goal, current = avance(current, phi_len, goal)
        word_local_id = 0
        for word_weight in t:
            # One "topic_id;word_id;weight" line per (topic, word) pair.
            fw.write(str(topic_local_to_universal[topic_local_id])+';'+str(word_local_to_universal[word_local_id])+';'+str(word_weight)+'\n')
            word_local_id += 1
        topic_local_id += 1
    fw.close()
    load_data_in_file()
    return True