Example #1
def clean_steam():

    documents = Document.objects.all()
    goal = 0
    current = 0
    leng = len(documents)
    for document in documents:
        goal, current = avance(current, leng, goal)
        if document.steamed_content:
            text_to_clean = document.steamed_content

            aux = unicode(text_to_clean)

            # Remove <>, [], <!-- --> and line breaks
            aux = strip_tags(aux)

            # Trim surrounding whitespace, lowercase, and strip accents
            aux = ' '+remove_non_unicode(aux.strip().lower())+' '

            # Remove numbers and special characters
            aux = remove_non_alphanumeric(aux)

            # Collapse repeated spaces
            aux = remove_spaces(aux)

            document.steamed_content = aux
            document.save()
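
The helpers avance, remove_non_unicode, remove_non_alphanumeric and remove_spaces are defined elsewhere in this project and not shown in these examples. A minimal sketch of what they might look like, inferred from the comments and call sites above (the names are the project's; the bodies are assumptions):

# -*- coding: utf-8 -*-
import re
import unicodedata

def avance(current, leng, goal):
    # Assumed behavior: progress reporting. Bumps the counter and prints
    # each time another 10% of the `leng` items has been processed.
    current += 1
    if leng and current * 100 / leng >= goal:
        print "    %s%% (%s/%s)" % (goal, current, leng)
        goal += 10
    return goal, current

def remove_non_unicode(text):
    # Assumed behavior: strip accents, e.g. u'aviación' -> 'aviacion'
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')

def remove_non_alphanumeric(text):
    # Assumed behavior: replace anything that is not a lowercase letter
    # or a space (digits, punctuation) with a space
    return re.sub(r'[^a-z ]', ' ', text)

def remove_spaces(text):
    # Assumed behavior: collapse runs of whitespace into single spaces
    return re.sub(r'\s+', ' ', text)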
Example #2
def clean_content(request):
    """Limpia el contenido de un documento filtrando stop words y caracteres que no son letras"""

    documents = Document.objects.filter(
        Q(cleaned_content='') | Q(cleaned_content=None) |
        Q(steamed_content='') | Q(steamed_content=None))

    goal = 0
    current = 0
    leng = len(documents)
    print " -> Removing Stop Words and weird chars..."

    # Build a single regex alternation of space-padded stop words
    sw = Stopword.objects.all()
    stopwords = '|'.join([" "+str(x)+" " for x in sw])

    print " -> Cleaning Documents"
    for d in documents:
        goal, current = avance(current, leng, goal)
        if not d.cleaned_content:
            d.clean_content(stopwords)
            if d.cleaned_content:
                #d.stemmed_content = freeling_stemming(d.cleaned_content)
                d.save()
            else:
                d.delete()

    print "  Documents cleaned!"
Example #3
def remove_common_words(ldamodel):
    """ This method removes words which have similar frequency among diffrent datasets """
    print "Checking and Removing common words"

    remove_list = []

    n_of_datasets = DataSetLdaModel.objects.filter(ldamodel=ldamodel).count()
    datasets = DataSet.objects.filter(datasetldamodel__ldamodel = ldamodel)
    if n_of_datasets < 2: return

    lda_words = WordLdaModel.objects.filter(ldamodel=ldamodel)
    
    goal = 0
    current = 0
    leng = len(lda_words)

    for this_word in lda_words:

        goal, current = avance(current, leng, goal)
        freq_table = n_of_datasets*[0]
        wsf_involved = WordDataSetFrequency.objects.filter(word = this_word, dataset__in = datasets)

        for i, wsf in enumerate(wsf_involved):
            freq_table[i] = wsf.frequency

        freq_tot = sum(freq_table)
        freq_avg = float(freq_tot)/n_of_datasets

        # Average of the pairwise frequency deltas, over n*(n-1)/2 pairs
        delta_avg = 0
        for i in range(0,n_of_datasets-1):
            for j in range(i+1,n_of_datasets):
                delta_avg += abs(freq_table[i]-freq_table[j])
        delta_avg = float(delta_avg)*2/((n_of_datasets-1)*n_of_datasets)

        # Remove
        if delta_avg < freq_avg:
            remove_list.append(str(this_word.id))

    if remove_list:
        
        sql = "DELETE FROM application_wordldamodel WHERE id IN (%s)" % ",".join(remove_list)
        cursor = connection.cursor()
        cursor.execute(sql)
        cursor.execute("COMMIT")
        connection.close()
        print " -> %s Words removed" % len(remove_list)

    else:
        print " -> No words removed"
Example #4
def load_words(request, ldamodel = None):
    """Loads the dictionary from the previously cleaned words"""
    cursor = connection.cursor()


    print "Reading words we already have..."
    historic_words = []
    for w in Word.objects.all():
        historic_words.append(w.name)

    print "We have %s words..." % len(historic_words)

    dictionary = {}

    print "Loading words..."
    documents = Document.objects.filter(loaded_words = 0)

    print " -> %s documents loaded..." % len(documents)
    for d in documents:
        for a in d.cleaned_content.split(" "): dictionary[a] = 1

    goal = 0
    current = 0
    leng = len(dictionary)

    for d in dictionary:
        goal, current = avance(current, leng, goal)
        if d not in historic_words:
            w = Word(name = d)
            try:
                w.save()
            except Exception as e:
                # MySQL error 1062 = duplicate entry; only report other failures
                if int(e.args[0]) != 1062:
                    print "Failed "+w.name
    print "  Words Loaded!"
    
    print "  Updating documents..."
    cursor.execute("UPDATE application_document SET loaded_words = 1 WHERE loaded_words = 0;")
    cursor.execute("COMMIT")
    print "  Documents Updated!"
    connection.close()
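
Catching MySQL error 1062 to skip duplicates works, but Django's get_or_create expresses the same "insert unless present" intent without preloading every existing word into memory. An equivalent inner loop (a sketch; error reporting is simplified):

    for d in dictionary:
        goal, current = avance(current, leng, goal)
        try:
            Word.objects.get_or_create(name=d)
        except Exception as e:
            print "Failed "+d
            print e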
Example #5
def get_freq(request, ldamodel = None):
    """calculos de la tabla frequency"""
    #LIMPIO LA TABLA
    cursor = connection.cursor()
    print "Getting Frequencies..."

    documents = Document.objects.filter(frec_calculated = 0)

    # Compute the frequencies
    wf = {}
    words = {}
    for w in Word.objects.all():
        words[w.name] = w

    print "  Getting simple frequency calculated..."
    goal = 0
    current = 0
    leng = len(documents)
    i = 0
    for d in documents:
        goal, current = avance(current, leng, goal)

        wf.clear()
        #print "Counting Words..."
        if ldamodel and ldamodel.stemming and d.steamed_content:
            if i == 0: print "using stemming"
            i += 1
            content = d.steamed_content.split(' ')
        else:
            content = d.cleaned_content.split(' ')
        
        for s in content:
            wf[s] = wf.get(s,0) + 1

        #print "Saving..."
        freqs = []
        for kw in wf:
            try:
                freqs.append("("+str(d.id)+","+str(words[kw].id)+","+str(wf[kw])+")")
            except Exception as e:
                print "In document %s, error:" % d.id
                print e

        # Guard against an empty VALUES list, which would be invalid SQL
        if freqs:
            query = "INSERT INTO application_frequency (document_id, word_id, frequency) VALUES %s;" % ",".join(freqs)
            cursor.execute(query)
            cursor.execute("COMMIT")
    print "  Simple frequency calculated!"
    
    print "  Getting max frequency..."
    cursor.execute("""UPDATE application_document D SET max_word_frec=(SELECT MAX(F.frequency) FROM application_frequency F WHERE F.document_id=D.id AND max_word_frec is NULL)""")
    print "  Max frequency calculated!"
    
    print "  Updating documents..."
    cursor.execute("UPDATE application_document SET frec_calculated = 1 WHERE frec_calculated = 0;")
    print "  Documents Updated!"

    cursor.execute("COMMIT")
    
    connection.close()
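
The INSERT above is built by string concatenation, which is safe here only because every value is an integer id or a count. A parameterized executemany does the same bulk insert without manual SQL assembly (a sketch of the per-document body; unknown words are skipped instead of logged):

        rows = [(d.id, words[kw].id, wf[kw]) for kw in wf if kw in words]
        if rows:
            cursor.executemany(
                "INSERT INTO application_frequency (document_id, word_id, frequency) VALUES (%s, %s, %s)",
                rows)
            cursor.execute("COMMIT")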
Example #6
def lda(documents_dist,topic_local_to_universal,alpha,beta):
    """ Runs LDA over a set of documents, saving results over a set of predefined topics """

    cursor = connection.cursor()
    n_topics = len(topic_local_to_universal)
    
    word_local_to_universal = {}
    word_universal_to_local = {}
    
    document_local_to_universal = {}
    
    print "Getting document matrix..."

    # Each distribution is a comma-separated string of hex word ids (with a
    # trailing comma, hence the [:-1])
    dic = [word_mapper(map(lambda x: int(str(x), 16),
                           document_dist.distribution[:-1].split(',')),
                       word_local_to_universal,
                       word_universal_to_local)
           for document_dist in documents_dist]
    document_local_to_universal = dict(enumerate([document_dist.document.id for document_dist in documents_dist]))

    n_documents = len(dic)
    n_words = len(word_local_to_universal)

    print "Number of documents: "+str(n_documents)
    print "Number of words: "+str(n_words)

    if n_documents == 0:
        raise Exception('LDAmodel has no documents assigned or the documents had only irrelevant words. No document matrix found.')
    
    f_label = 1
    numsamp = 50
    randseed = 194582

    alpha_vector = alpha * ones((f_label,n_topics))
    beta_vector = beta * ones((n_topics,n_words)) 

    print "Calculating LDA using..."
    print "   beta: "+str(beta)
    print "   alpha: "+str(alpha)
    print "   ntopics: "+str(n_topics)

    (phi,theta,sample) = deltaLDA(dic,alpha_vector,beta_vector,numsamp,randseed)
    print "Saving Results..."
    
    ########################
    #    document_topic
    ########################
              
    print "Saving Document and topic correlation..."
    document_local_id = 0
    goal = 0
    current = 0
    theta_len = len(theta)
    for d in theta:
        st = "INSERT INTO application_documenttopic (document_id, topic_id, value) VALUES "
        goal, current = avance(current, theta_len, goal)
        topic_local_id = 0
        for document_weight in d:
            st = st + "("+str(document_local_to_universal[document_local_id])+","+str(topic_local_to_universal[topic_local_id])+","+str(document_weight)+"),"
            topic_local_id += 1
        st = st[:-1]+";"
        cursor.execute(st)
        cursor.execute("COMMIT")
        document_local_id += 1
    
    #####################          
    #    topic_word
    #####################
    
    print "Saving topics and word correlation to file"
    topic_local_id = 0
    goal = 0
    current = 0
    phi_len = len(phi)
    nbest = int(n_words*0.5)


    os.system("touch /tmp/application_topicword.txt")
    os.system("chmod 777 /tmp/application_topicword.txt")
    FILE = '/tmp/application_topicword.txt'
    print 'Opening %s' % FILE
    fw = open (FILE,'w')
    
    for t in phi:
        goal, current = avance(current, phi_len, goal)
        word_local_id = 0
        for word_weight in t:
            fw.write(str(topic_local_to_universal[topic_local_id])+';'+str(word_local_to_universal[word_local_id])+';'+str(word_weight)+'\n')
            word_local_id += 1
        topic_local_id += 1

    fw.close()
    
    load_data_in_file()
    
    return True
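
load_data_in_file() is not shown. Given that the loop above writes topic_id;word_id;weight lines to /tmp/application_topicword.txt, it presumably bulk-loads that file with MySQL's LOAD DATA INFILE. A minimal sketch under that assumption (the target table and column names are guesses based on the file name and the documenttopic table used earlier):

from django.db import connection

def load_data_in_file():
    # Assumption: bulk-load the ';'-separated dump written by lda()
    cursor = connection.cursor()
    cursor.execute("""LOAD DATA INFILE '/tmp/application_topicword.txt'
                      INTO TABLE application_topicword
                      FIELDS TERMINATED BY ';'
                      LINES TERMINATED BY '\\n'
                      (topic_id, word_id, value)""")
    cursor.execute("COMMIT")
    connection.close()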