def main(No_ch=0):
    """Assemble the depression ('dp') corpus for one chunk and print its tokens.

    The two source classes 'dpp' and 'dpn' are loaded and analysed through
    ``dic``, merged into the combined 'dp' entry of ``dic.types``, and the
    resulting rows are passed to ``norm.parseForTokensFixed``; the result of
    that call is printed.

    Args:
        No_ch: chunk number forwarded unchanged to ``dic.analyzeChunk``.
    """
    target = 'dp'
    sources = ('dpp', 'dpn')

    # ---- Corpus acquisition: start ----
    print("Adquisicion de corpus de depresion")
    for source in sources:
        # The path list is cleared before every load, as each class is
        # loaded into the same attribute.
        dic.chunks_paths = []
        dic.loadchunkXML(source)
        dic.analyzeChunk(source, No_ch)
    for source in sources:
        print('Numero de chunks en types ', len(dic.types[source]))

    dic.initialize_class_types(target)
    for source in sources:
        dic.appendPost(source, target)
    print('Numero de instancias en depresion', len(dic.types[target]['rows']))

    cols = dic.fillOnesZeros(target)
    dic.types[target]['cols'] = cols
    print('Matriz Y', len(dic.types[target]['cols']))
    dic.types[target]['names'] = ['Negative', 'Positive']
    # ---- Corpus acquisition: end ----

    # ---- Corpus normalization: start ----
    tokens = norm.parseForTokensFixed(dic.types[target]['rows'])
    print(tokens)
Example #2
0
def parsing(illness='dp',cat=0,No_ch=0):
    """Return the matrix produced by a ``ch.Chunk`` for one class and chunk.

    The (illness, cat) pair selects which class is loaded through ``gd``:
    ('dp', 0) -> 'dpn', ('dp', 1) -> 'dpp', ('ax', 0) -> 'axn'; every other
    combination falls through to 'axp'.

    Args:
        illness: illness code compared against 'dp' and 'ax'.
        cat: category flag compared against 0 and 1.
        No_ch: 1-based chunk number; 0 means "do nothing".

    Returns:
        An empty list when ``No_ch`` is 0, otherwise the value of
        ``getidlvMatrix()`` of the chunk filled with every user found in
        ``gd.chunks_paths[No_ch - 1]``.
    """
    if No_ch == 0:
        return []

    # (illness, cat) -> (debug message, class key); checked in order.
    known_cases = (
        ('dp', 0, 'DEBUGG:__Depresion negativo', 'dpn'),
        ('dp', 1, 'DEBUG:__Depresion positivo', 'dpp'),
        ('ax', 0, 'DEBUG:__Anorexia negativo', 'axn'),
    )
    # Default used when no known case matches.
    message, class_key = 'DEBUG:__Anorexia positivo', 'axp'
    for wanted_illness, wanted_cat, case_message, case_key in known_cases:
        if illness == wanted_illness and cat == wanted_cat:
            message, class_key = case_message, case_key
            break

    gd.chunks_paths = []
    print(message)
    gd.loadchunkXML(class_key)

    users = ch.Chunk(No_ch)
    for path in gd.chunks_paths[No_ch - 1]:
        uid, posts = gd.PostForUser(path)
        users.newUser(uid, posts)
    return users.getidlvMatrix()
Example #3
0
import normalization as norm
import sys
from sklearn.feature_extraction.text import CountVectorizer
import Chunk as ch

# The chunk number is required as the single command-line argument. Exit with
# a usage message when it is missing: otherwise No_ch would be left undefined
# and the script would fail with a NameError (or an IndexError on sys.argv[1]).
if len(sys.argv) != 2:
    sys.exit('Usage: {} <chunk number>'.format(sys.argv[0]))

try:
    No_ch = int(sys.argv[1])
except ValueError:
    sys.exit('Chunk number must be an integer, got {!r}'.format(sys.argv[1]))

print("Chunk ", sys.argv[1])
# Corpus acquisition: start
print("Adquisición de corpus de depresion")

# First class ('dpp'): load its chunk listing, analyse the requested chunk and
# register every user file of that chunk in a fresh Chunk object.
dic.chunks_paths = []
dic.loadchunkXML('dpp')
dic.analyzeChunk('dpp', No_ch)
chunk = ch.Chunk(No_ch)
for v in dic.chunks_paths[No_ch - 1]:  # chunk numbers are 1-based
    (uid, posts) = dic.PostForUser(v)
    chunk.newUser(uid, posts)

# Second class ('dpn'): same steps, users are added to the same Chunk object.
dic.chunks_paths = []
dic.loadchunkXML('dpn')
dic.analyzeChunk('dpn', No_ch)
for v in dic.chunks_paths[No_ch - 1]:
    (uid, posts) = dic.PostForUser(v)
    chunk.newUser(uid, posts)

print('Numero de chunks en types ', len(dic.types['dpp']))
print('Numero de chunks en types ', len(dic.types['dpn']))
                p = re.compile(llave, re.IGNORECASE)
                if re.match(p,token):
                    # print('Match!\tToken: ', token, '\tLidwc: ', llave)
                    fo = fo + 1
                    for i in lidwc[llave]:
                        lidwc_cats[i] = lidwc_cats[i] + 1
            elif token.lower() == llave:
                # print('Match!\tToken: ', token, '\tLidwc: ', llave)
                fo = fo + 1
                for i in lidwc[llave]:
                    lidwc_cats[i] = lidwc_cats[i] + GD.types['dpn']['chunk'+str(ch)][token]
    return fo


# Driver: for chunks 1..10 of the 'dpn' class, run parseForTypes and write the
# contents of lidwc_cats to one TSV file per chunk, then open a second
# per-chunk TSV file (its use continues below this section).
print('Cargando los chuncks\t.\t-\t\\\t|\t/\t-\t.\t1<=>3\n')
# Step 1 of 3: load the chunk listing for the 'dpn' class.
GD.loadchunkXML('dpn')
print('Cargando los diccionarios\t.\t-\t\\\t|\t/\t-\t.\t2<=>3\n')
# Step 2 of 3: presumably fills lidwc / lidwc_cats -- confirm in loaddictionary.
loaddictionary()
print('Analizando los tipos:\t.\t-\t\\\t|\t/\t-\t.\t3<=>3\n')
print('Chunk\tMatches\tTipos\ttokens')
# Step 3 of 3: process chunk numbers 1 through 10.
for ch in range(1,11):
    # fo receives the value returned by parseForTypes for this chunk.
    fo = parseForTypes(ch)
    # One "<key>\t<value>" line per entry of lidwc_cats.
    # NOTE(review): lidwc_cats is not reset in this loop; unless parseForTypes
    # resets it, the values written for later chunks include earlier chunks.
    FH = open('../dpn_categoriasCHUNK'+str(ch)+'.tsv', 'w')
    for i in lidwc_cats.keys():
        FH.write(str(i)+'\t'+str(lidwc_cats[i])+'\n')
    FH.close()
    # Disabled: ratio of matches to the number of entries, hard-coded to chunk3.
    # pr = fo / len(GD.types['dpn']['chunk3'])
    # print('Tokens en chunk3:\t', len(GD.types['dpn']['chunk3']), '\nCoincidencias en Liwc:\t', str(fo), '\nRepresentatividad:\t', str(pr))
    numTokens = 0
    # Second output file for this chunk; FH is rebound to the new handle.
    FH = open('../dpn_diccionarioCHUNK'+str(ch)+'.tsv', 'w')
    for tipo in GD.types['dpn']['chunk'+str(ch)].keys():
Example #5
0
def _feature_names(vectorizer):
    """Return the feature names of a fitted vectorizer as a list.

    Uses ``get_feature_names_out`` when the vectorizer provides it and falls
    back to the older ``get_feature_names`` otherwise, so the code works both
    before and after the old method was removed from scikit-learn.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


def _evaluate_corpus(prefix, label, No_ch):
    """Run the acquisition / Naive Bayes / mutual-information steps for a corpus.

    Args:
        prefix: key of the combined corpus in ``dic.types`` ('dp' or 'ax');
            the two source classes are ``prefix + 'p'`` and ``prefix + 'n'``.
        label: corpus name inserted into the progress messages.
        No_ch: chunk number forwarded unchanged to ``dic.analyzeChunk``.
    """
    from feature_extractor import bow_extractor, bow_extractor_maxdf
    from sklearn.feature_selection import mutual_info_classif
    from sklearn.model_selection import cross_val_predict
    from sklearn.naive_bayes import MultinomialNB

    sources = (prefix + 'p', prefix + 'n')

    # ---- Corpus acquisition ----
    print("Adquisición de corpus de {}".format(label))
    for source in sources:
        dic.chunks_paths = []  # cleared before each class is loaded
        dic.loadchunkXML(source)
        dic.analyzeChunk(source, No_ch)
    for source in sources:
        print('Numero de chunks en types ', len(dic.types[source]))
    dic.initialize_class_types(prefix)
    for source in sources:
        dic.appendPost(source, prefix)
    print('Numero de instancias en {}'.format(label),
          len(dic.types[prefix]['rows']))
    dic.types[prefix]['cols'] = dic.fillOnesZeros(prefix)
    print('Matriz Y', len(dic.types[prefix]['cols']))
    dic.types[prefix]['names'] = ['Negative', 'Positive']
    targets = dic.types[prefix]['cols']

    # ---- Corpus normalization ----
    norm_train_corpus = norm.normalize_corpus(dic.types[prefix]['rows'])

    # ---- Bag-of-words features + 10-fold cross-validated Naive Bayes ----
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    print('Numero de caracteristicas tomadas en cuenta',
          len(_feature_names(bow_vectorizer)))
    y_predicted = cross_val_predict(MultinomialNB(),
                                    bow_train_features,
                                    targets,
                                    cv=10)
    evaluator.get_metrics(targets, y_predicted)

    # ---- Mutual information per feature (max-df variant of the extractor) ----
    maxdf_vectorizer, maxdf_features = bow_extractor_maxdf(norm_train_corpus)
    # The names must come from the vectorizer that produced these features;
    # reusing the names of the first vectorizer pairs scores with the wrong
    # terms whenever the two vocabularies differ.
    maxdf_names = _feature_names(maxdf_vectorizer)
    scores = mutual_info_classif(maxdf_features,
                                 targets,
                                 discrete_features=True)
    for feat, score in zip(maxdf_names, scores):
        print(feat, str(score), '\n')


def main(No_ch=0):
    """Evaluate the depression corpus and then the anorexia corpus.

    For each corpus the chunk ``No_ch`` is acquired through ``dic``, the rows
    are normalized with ``norm.normalize_corpus``, a MultinomialNB classifier
    is evaluated with 10-fold cross-validated predictions (metrics reported
    through ``evaluator.get_metrics``), and the mutual information between
    every max-df bag-of-words feature and the targets is printed.

    Args:
        No_ch: chunk number forwarded unchanged to ``dic.analyzeChunk``.
    """
    _evaluate_corpus('dp', 'depresion', No_ch)
    _evaluate_corpus('ax', 'anorexia', No_ch)
Example #6
0
# Per-user statistics script: for every class and every XML file of every
# chunk, write the contents of lidwc_cats and the user's type dictionary to
# TSV files under ../ForUserAnalysis/.
lidwc = {}
lidwc_cats = {}
# Class keys processed by the main loop below.
clases = ['axp','axn','dpp','dpn']
# clases = ['axp']
print("Cargando Lidwc: \tdiccionario\t...(1,5)\n")
lidwc = LDW.getWORDS()
print("Cargando Lidwc: \tcategorias\t...(2,5)\n")
lidwc_cats = LDW.getNUM()
print("Analisis de clases\t\t...(3,5)\n")
for clase in clases:
    # Clear the path list before loading the next class.
    GD.chunks_paths = []
    print('Cargando los chunks de ', clase, '\t\t...(4,5)\n')
    # Per-class summary file. It is opened in append mode, so the header line
    # below is written again on every run of the script.
    # NOTE(review): no FH.close() is visible in this section -- confirm the
    # handle is closed further down.
    FH = open('../ForUserAnalysis/'+clase+'.tsv', 'a')
    FH.write('usuario\tchunk\tposts\tliwc\tvocabulary\ttokens\n')
    GD.loadchunkXML(clase)
    print('Estadisticas base por xml\t\t...(5,5)\n')
    for chunk in GD.chunks_paths:
        for xml in chunk:
            # Capture the 1-2 digit number that follows "subject<digits>_" in
            # the file name; it is used as the chunk number in the output
            # file names below.
            # NOTE(review): the pattern is not a raw string ('\.' is an invalid
            # escape sequence), and m is None when the name does not match,
            # which makes m.group(1) raise AttributeError.
            m = re.search('subject[0-9]+_([0-9]{1,2})\.xml',xml)
            (usuario, no_post, diccionario) = GD.typesforUser(xml)
            fo = parseForTypes(usuario, diccionario)
            # One "<key>\t<value>" line per entry of lidwc_cats.
            # NOTE(review): lidwc_cats is not reset in this loop; unless
            # parseForTypes resets it, values carry over between users.
            CFH = open('../ForUserAnalysis/'+clase+'_categoriasCHUNK'+m.group(1)+'_'+usuario+'.tsv', 'w')
            for i in lidwc_cats.keys():
                CFH.write(str(i)+'\t'+str(lidwc_cats[i])+'\n')
            CFH.close()
            # Write one "<tipo>\t<value>" line per dictionary entry and sum
            # the values into numTokens.
            numTokens = 0
            LFH = open('../ForUserAnalysis/'+clase+'_diccionarioCHUNK'+m.group(1)+'_'+usuario+'.tsv', 'w')
            for tipo in diccionario.keys():
                numTokens = numTokens + diccionario[tipo]
                LFH.write(tipo + '\t' + str(diccionario[tipo])+'\n')