def main(No_ch=0):
    # ############################ DEPRESSION ############################
    # Corpus acquisition >>>>>>>> START
    print("Acquiring the depression corpus")
    dic.chunks_paths = []
    dic.loadchunkXML('dpp')
    dic.analyzeChunk('dpp', No_ch)
    dic.chunks_paths = []
    dic.loadchunkXML('dpn')
    dic.analyzeChunk('dpn', No_ch)
    print('Number of chunks in types ', len(dic.types['dpp']))
    print('Number of chunks in types ', len(dic.types['dpn']))
    dic.initialize_class_types('dp')
    dic.appendPost('dpp', 'dp')
    dic.appendPost('dpn', 'dp')
    print('Number of depression instances', len(dic.types['dp']['rows']))
    dic.types['dp']['cols'] = dic.fillOnesZeros('dp')
    print('Y matrix', len(dic.types['dp']['cols']))
    dic.types['dp']['names'] = ['Negative', 'Positive']
    # print(dic.types['dp']['rows'])
    # Corpus acquisition >>>>>>>> END

    # Corpus normalization >>>>>>>>>> START
    norm_train_corpus = norm.parseForTokensFixed(dic.types['dp']['rows'])
    print(norm_train_corpus)
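
# A minimal driver sketch, assuming main() above lives in a runnable script;
# the sys.argv handling mirrors the script-level variant of this pipeline
# further below.
if __name__ == '__main__':
    import sys
    No_ch = int(sys.argv[1]) if len(sys.argv) == 2 else 0
    main(No_ch)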
def parsing(illness='dp', cat=0, No_ch=0):
    ilM = []
    if No_ch != 0:
        gd.chunks_paths = []
        # Select the corpus key for the requested illness/polarity pair.
        if illness == 'dp' and cat == 0:
            print('DEBUG:__Depression negative')
            key = 'dpn'
        elif illness == 'dp' and cat == 1:
            print('DEBUG:__Depression positive')
            key = 'dpp'
        elif illness == 'ax' and cat == 0:
            print('DEBUG:__Anorexia negative')
            key = 'axn'
        else:
            print('DEBUG:__Anorexia positive')
            key = 'axp'
        gd.loadchunkXML(key)
        ilchunk = ch.Chunk(No_ch)
        for v in gd.chunks_paths[No_ch - 1]:
            (uid, posts) = gd.PostForUser(v)
            ilchunk.newUser(uid, posts)
        ilM = ilchunk.getidlvMatrix()
    return ilM
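
# Hypothetical usage sketch: collect one idlv matrix per illness/polarity
# pair. The (illness, cat) combinations are the ones parsing() distinguishes.
def collect_all_matrices(No_ch=1):
    pairs = [('dp', 0), ('dp', 1), ('ax', 0), ('ax', 1)]
    return {pair: parsing(illness=pair[0], cat=pair[1], No_ch=No_ch)
            for pair in pairs}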
import sys

import normalization as norm
from sklearn.feature_extraction.text import CountVectorizer

import Chunk as ch
# NOTE: the corpus-access module used below as `dic` is not imported in this
# fragment; its import belongs here.

if len(sys.argv) == 2:
    No_ch = int(sys.argv[1])
    # No_ch = 1
    print("Chunk ", sys.argv[1])

# Corpus acquisition >>>>>>>> START
print("Acquiring the depression corpus")
dic.chunks_paths = []
dic.loadchunkXML('dpp')
dic.analyzeChunk('dpp', No_ch)
chunk = ch.Chunk(No_ch)
for v in dic.chunks_paths[No_ch - 1]:
    (uid, posts) = dic.PostForUser(v)
    chunk.newUser(uid, posts)
dic.chunks_paths = []
dic.loadchunkXML('dpn')
dic.analyzeChunk('dpn', No_ch)
for v in dic.chunks_paths[No_ch - 1]:
    (uid, posts) = dic.PostForUser(v)
    chunk.newUser(uid, posts)
print('Number of chunks in types ', len(dic.types['dpp']))
print('Number of chunks in types ', len(dic.types['dpn']))
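
# Sketch of the bag-of-words step that CountVectorizer is imported for; the
# project's bow_extractor (in feature_extractor, not shown here) is assumed
# to wrap something like this.
def bow_sketch(corpus, min_df=1):
    vectorizer = CountVectorizer(min_df=min_df)
    features = vectorizer.fit_transform(corpus)  # sparse document-term matrix
    return vectorizer, features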
        p = re.compile(llave, re.IGNORECASE)
        if p.match(token):
            # print('Match!\tToken: ', token, '\tLidwc: ', llave)
            fo = fo + 1
            for i in lidwc[llave]:
                lidwc_cats[i] = lidwc_cats[i] + 1
        elif token.lower() == llave:
            # print('Match!\tToken: ', token, '\tLidwc: ', llave)
            fo = fo + 1
            for i in lidwc[llave]:
                lidwc_cats[i] = lidwc_cats[i] + GD.types['dpn']['chunk' + str(ch)][token]
    return fo


print('Loading the chunks\t.\t-\t\\\t|\t/\t-\t.\t1<=>3\n')
GD.loadchunkXML('dpn')
print('Loading the dictionaries\t.\t-\t\\\t|\t/\t-\t.\t2<=>3\n')
loaddictionary()
print('Analyzing the types:\t.\t-\t\\\t|\t/\t-\t.\t3<=>3\n')
print('Chunk\tMatches\tTypes\ttokens')
for ch in range(1, 11):
    fo = parseForTypes(ch)
    FH = open('../dpn_categoriasCHUNK' + str(ch) + '.tsv', 'w')
    for i in lidwc_cats.keys():
        FH.write(str(i) + '\t' + str(lidwc_cats[i]) + '\n')
    FH.close()
    # pr = fo / len(GD.types['dpn']['chunk3'])
    # print('Tokens in chunk3:\t', len(GD.types['dpn']['chunk3']), '\nMatches in Liwc:\t', str(fo), '\nRepresentativeness:\t', str(pr))
    numTokens = 0
    FH = open('../dpn_diccionarioCHUNK' + str(ch) + '.tsv', 'w')
    for tipo in GD.types['dpn']['chunk' + str(ch)].keys():
def main(No_ch=0):
    # Corpus acquisition >>>>>>>> START
    print("Acquiring the depression corpus")
    dic.chunks_paths = []
    dic.loadchunkXML('dpp')
    dic.analyzeChunk('dpp', No_ch)
    dic.chunks_paths = []
    dic.loadchunkXML('dpn')
    dic.analyzeChunk('dpn', No_ch)
    print('Number of chunks in types ', len(dic.types['dpp']))
    print('Number of chunks in types ', len(dic.types['dpn']))
    dic.initialize_class_types('dp')
    dic.appendPost('dpp', 'dp')
    dic.appendPost('dpn', 'dp')
    print('Number of depression instances', len(dic.types['dp']['rows']))
    dic.types['dp']['cols'] = dic.fillOnesZeros('dp')
    print('Y matrix', len(dic.types['dp']['cols']))
    dic.types['dp']['names'] = ['Negative', 'Positive']
    # Corpus acquisition >>>>>>>> END

    # Corpus normalization >>>>>>>>>> START
    norm_train_corpus = norm.normalize_corpus(dic.types['dp']['rows'])
    # Corpus normalization >>>>>>>>>> END

    from feature_extractor import bow_extractor, tfidf_extractor, bow_extractor_maxdf
    from sklearn.feature_selection import mutual_info_classif
    import nltk
    import gensim

    # BOW features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    feature_names = bow_vectorizer.get_feature_names()
    print('Number of features considered', len(feature_names))

    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    from sklearn.model_selection import cross_val_predict
    y_predicted = cross_val_predict(nb, bow_train_features,
                                    dic.types['dp']['cols'], cv=10)
    evaluator.get_metrics(dic.types['dp']['cols'], y_predicted)

    bow_vectorizer, bow_train_features = bow_extractor_maxdf(norm_train_corpus)
    # Refresh the vocabulary: the max_df variant fits a different vectorizer,
    # so the previous feature_names would be misaligned with these columns.
    feature_names = bow_vectorizer.get_feature_names()
    res = dict(
        zip(feature_names,
            mutual_info_classif(bow_train_features, dic.types['dp']['cols'],
                                discrete_features=True)))
    for feat in res.keys():
        print(feat, str(res[feat]), '\n')
    # y_predicted = cross_val_predict(nb, bow_train_features, dic.types['dp']['cols'], cv=10)
    # evaluator.get_metrics(dic.types['dp']['cols'], y_predicted)

    # Corpus acquisition >>>>>>>> START
    print("Acquiring the anorexia corpus")
    dic.chunks_paths = []
    dic.loadchunkXML('axp')
    dic.analyzeChunk('axp', No_ch)
    dic.chunks_paths = []
    dic.loadchunkXML('axn')
    dic.analyzeChunk('axn', No_ch)
    print('Number of chunks in types ', len(dic.types['axp']))
    print('Number of chunks in types ', len(dic.types['axn']))
    dic.initialize_class_types('ax')
    dic.appendPost('axp', 'ax')
    dic.appendPost('axn', 'ax')
    print('Number of anorexia instances', len(dic.types['ax']['rows']))
    dic.types['ax']['cols'] = dic.fillOnesZeros('ax')
    print('Y matrix', len(dic.types['ax']['cols']))
    dic.types['ax']['names'] = ['Negative', 'Positive']
    # Corpus acquisition >>>>>>>> END

    # Corpus normalization >>>>>>>>>> START
    norm_train_corpus = norm.normalize_corpus(dic.types['ax']['rows'])
    # Corpus normalization >>>>>>>>>> END

    # BOW features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    feature_names = bow_vectorizer.get_feature_names()
    print('Number of features considered', len(feature_names))

    nb = MultinomialNB()
    y_predicted = cross_val_predict(nb, bow_train_features,
                                    dic.types['ax']['cols'], cv=10)
    evaluator.get_metrics(dic.types['ax']['cols'], y_predicted)

    bow_vectorizer, bow_train_features = bow_extractor_maxdf(norm_train_corpus)
    # Refresh the vocabulary for the re-fitted max_df vectorizer.
    feature_names = bow_vectorizer.get_feature_names()
    res = dict(
        zip(feature_names,
            mutual_info_classif(bow_train_features, dic.types['ax']['cols'],
                                discrete_features=True)))
    for feat in res.keys():
        print(feat, str(res[feat]), '\n')
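
# Optional sketch: the loops above print mutual-information scores in
# arbitrary dict order; ranking them first makes the output easier to read.
# `res` is the feature -> score dict built in main(); print_top_features is
# a hypothetical helper, not part of the original pipeline.
def print_top_features(res, k=50):
    for feat, score in sorted(res.items(), key=lambda kv: kv[1], reverse=True)[:k]:
        print(feat, score)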
lidwc = {}
lidwc_cats = {}
clases = ['axp', 'axn', 'dpp', 'dpn']
# clases = ['axp']
print("Loading LIWC: \tdictionary\t...(1,5)\n")
lidwc = LDW.getWORDS()
print("Loading LIWC: \tcategories\t...(2,5)\n")
lidwc_cats = LDW.getNUM()
print("Analyzing classes\t\t...(3,5)\n")
for clase in clases:
    GD.chunks_paths = []
    print('Loading the chunks for ', clase, '\t\t...(4,5)\n')
    FH = open('../ForUserAnalysis/' + clase + '.tsv', 'a')
    FH.write('usuario\tchunk\tposts\tliwc\tvocabulary\ttokens\n')
    GD.loadchunkXML(clase)
    print('Base statistics per xml\t\t...(5,5)\n')
    for chunk in GD.chunks_paths:
        for xml in chunk:
            m = re.search(r'subject[0-9]+_([0-9]{1,2})\.xml', xml)
            (usuario, no_post, diccionario) = GD.typesforUser(xml)
            fo = parseForTypes(usuario, diccionario)
            CFH = open('../ForUserAnalysis/' + clase + '_categoriasCHUNK' + m.group(1) + '_' + usuario + '.tsv', 'w')
            for i in lidwc_cats.keys():
                CFH.write(str(i) + '\t' + str(lidwc_cats[i]) + '\n')
            CFH.close()
            numTokens = 0
            LFH = open('../ForUserAnalysis/' + clase + '_diccionarioCHUNK' + m.group(1) + '_' + usuario + '.tsv', 'w')
            for tipo in diccionario.keys():
                numTokens = numTokens + diccionario[tipo]
                LFH.write(tipo + '\t' + str(diccionario[tipo]) + '\n')