listDocOrg = fTxtOrg.readlines() print 'len(listDocOrg): ', len(listDocOrg) for rowOfListDocOrg in listDocOrg: # print 'rowOfListDocOrg: ', rowOfListDocOrg # myResult = p.search(rowOfListDocOrg) # if myResult <> None: # myData = re.sub('^Title: |^Abstract: ','',myResult.group()) # outf.write(myData) outf.write(rowOfListDocOrg) listMyWords.extend(rowOfListDocOrg.split()) # listDoc.append((myData.split(),fileOne[9:11])) print '(rowOfListDocOrg.split(),outputFileNameDiff): ', (outputFileNameDiff, rowOfListDocOrg.split()) listDoc.append((rowOfListDocOrg.split(),outputFileNameDiff)) idxCrossValidation = 0 for listTrainWithDiff, listValidationWithDiff in k_fold_cross_validation(listDoc, numFold, randomize = True): # outputPercentageFilenameBase = typeTextPreprocess+str(idxCrossValidation)+'-Per' outputPercentageFilenameBase = typeTextPreprocess+str(idxCrossValidation) with open(dirMain+dirOutput+outputPercentageFilenameBase+'-'+outputFileNameDiff+'-Train-'+'.txt', 'wb') as outf2: for oneRowOfListTrainWithDiff in listTrainWithDiff: # listAllDocWords.extend(oneRowOfListTrainWithDiff[0]) outf2.write(' '.join(oneRowOfListTrainWithDiff[0])+'\n') with open(dirMain+dirOutput+outputPercentageFilenameBase+'-'+outputFileNameDiff+'-Test-'+'.txt', 'wb') as outf3: for oneRowOflistValidationWithDiff in listValidationWithDiff: print 'oneRowOflistValidationWithDiff: ', oneRowOflistValidationWithDiff outf3.write(' '.join(oneRowOflistValidationWithDiff[0])+'\n') idxCrossValidation = idxCrossValidation + 1
def _reset_directory(label, path):
    """Recreate ``path`` as an empty directory.

    Any existing directory at ``path`` is removed together with its contents
    before the fresh one is created.  ``label`` only prefixes the log line.
    Errors from ``shutil.rmtree`` / ``os.mkdir`` propagate to the caller.
    """
    logging.info(label + ": " + path)
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.mkdir(path)


def fCreadeCrossValidationFiles(numFold):
    """Split every input text file into ``numFold`` train/test file pairs.

    Works below ``~/Data/TestDir/``:

    * ``Output1/`` is scanned for input files.  Regular files whose size is
      not greater than ``numFold * 1500`` bytes are DELETED from disk and
      ignored; entries that are not regular files are skipped untouched.
    * ``Output2_TestingSet/``, ``Output2_Merge/`` and
      ``Output2_TrainingSet/`` are wiped and recreated on every call.
    * For each remaining input file a verbatim copy
      (``wnl-<name>.csv``) is written to the testing directory, then the
      lines are handed to ``k_fold_cross_validation`` and, for every fold
      index ``i``:

      - ``Output2_TrainingSet/wnl-<i>-<name>-Train-.csv`` receives the
        training rows,
      - ``Output2_TestingSet/wnl-<i>-<name>-Test-.csv`` receives the
        validation rows,
      - the same rows are appended to the per-fold merge files
        ``Output2_Merge/wnl--<i>-Train-.csv`` / ``...-Test-.csv`` which
        therefore collect the rows of all input files.

    Each output row is the whitespace-split tokens of the input line joined
    by single spaces.

    Args:
        numFold: number of folds; passed to ``k_fold_cross_validation`` and
            also used to derive the minimum accepted input file size.

    Returns:
        None.  The function is used for its filesystem side effects.

    Raises:
        OSError / IOError: if a directory or file cannot be listed, removed,
            created, read or written.
    """
    # Prefix identifying the text pre-processing variant in output names.
    typeTextPreprocess = 'wnl-'

    dirMain = os.path.expanduser('~') + '/' + 'Data/TestDir/'
    logging.info("dirMain = os.path.expanduser('~')+'/': " + dirMain)
    dirInput = 'Output1/'
    dirOutputTest = 'Output2_TestingSet/'
    logging.info("dirOutputTest: " + dirOutputTest)
    dirOutputMergeFile = 'Output2_Merge/'
    dirOutputTrain = 'Output2_TrainingSet/'

    # Keep only inputs that are large enough; smaller ones are removed from
    # the input directory (this destructive step is original behaviour).
    minInputSize = numFold * 1500
    ListInputFilenameTxt = []
    for nameCandidate in os.listdir(dirMain + dirInput):
        pathCandidate = dirMain + dirInput + nameCandidate
        if not os.path.isfile(pathCandidate):
            # Sub-directories and other special entries can neither be read
            # as text nor deleted with os.remove(); leave them alone.
            continue
        if os.stat(pathCandidate).st_size > minInputSize:
            ListInputFilenameTxt.append(nameCandidate)
        else:
            logging.warning("Removing too small input file: %s", pathCandidate)
            os.remove(pathCandidate)
    logging.info("Unique Combinations of 2 letters from: " + ', '.join(ListInputFilenameTxt))

    # Start from empty output directories on every run.
    _reset_directory("dirMain + dirOutputTest", dirMain + dirOutputTest)
    _reset_directory("dirMain + dirOutputMergeFile", dirMain + dirOutputMergeFile)
    _reset_directory("dirMain + dirOutputTrain", dirMain + dirOutputTrain)

    for fileOne in ListInputFilenameTxt:
        # Drops the last four characters, i.e. an extension such as ".txt".
        outputFileNameDiff = fileOne[0:-4]
        logging.info('outputFileNameDiff: ' + outputFileNameDiff)

        pathFullCopy = dirMain + dirOutputTest + typeTextPreprocess + outputFileNameDiff + '.csv'
        logging.info(pathFullCopy)

        # One (tokens, source-name) tuple per input line.
        listDoc = []
        # NOTE: 'wb' + str writes relies on Python 2 string semantics, which
        # is what this file targets; under Python 3 these would need text mode.
        with open(pathFullCopy, 'wb') as outf:
            with open(dirMain + dirInput + fileOne) as fTxtOrg:
                listDocOrg = fTxtOrg.readlines()
            logging.info('len(listDocOrg): ' + str(len(listDocOrg)))
            for rowOfListDocOrg in listDocOrg:
                outf.write(rowOfListDocOrg)
                tokens = rowOfListDocOrg.split()
                # Lazy %-formatting: the list is only rendered when debug
                # logging is actually enabled.
                logging.debug('(rowOfListDocOrg.split(),outputFileNameDiff): %s - %s',
                              outputFileNameDiff, tokens)
                listDoc.append((tokens, outputFileNameDiff))

        # Third positional argument is presumably the "randomise" flag of
        # k_fold_cross_validation(X, K, randomise) - confirm against its
        # definition.
        folds = k_fold_cross_validation(listDoc, numFold, True)
        for idxCrossValidation, (listTrainWithDiff, listValidationWithDiff) in enumerate(folds):
            foldTag = str(idxCrossValidation)
            outputPercentageFilenameBase = typeTextPreprocess + foldTag
            pathMergeBase = dirMain + dirOutputMergeFile + typeTextPreprocess + '-' + foldTag
            pathFoldTrain = (dirMain + dirOutputTrain + outputPercentageFilenameBase
                             + '-' + outputFileNameDiff + '-Train-' + '.csv')
            pathFoldTest = (dirMain + dirOutputTest + outputPercentageFilenameBase
                            + '-' + outputFileNameDiff + '-Test-' + '.csv')

            # Merge files are opened in append mode so that rows from every
            # input file accumulate in the same per-fold file.
            with open(pathMergeBase + '-Train-' + '.csv', 'a') as outfFullTrain, \
                    open(pathMergeBase + '-Test-' + '.csv', 'a') as outfFullTest:
                with open(pathFoldTrain, 'wb') as outf2:
                    for oneRowOfListTrainWithDiff in listTrainWithDiff:
                        lineTrain = ' '.join(oneRowOfListTrainWithDiff[0]) + '\n'
                        outf2.write(lineTrain)
                        outfFullTrain.write(lineTrain)
                with open(pathFoldTest, 'wb') as outf3:
                    for oneRowOflistValidationWithDiff in listValidationWithDiff:
                        logging.debug('oneRowOflistValidationWithDiff: %s',
                                      oneRowOflistValidationWithDiff)
                        lineTest = ' '.join(oneRowOflistValidationWithDiff[0]) + '\n'
                        outf3.write(lineTest)
                        outfFullTest.write(lineTest)
es_entrenamiento = numpy.asarray(es_entrenamiento) archivo = numpy.concatenate((datos,es_entrenamiento[numpy.newaxis, :].T), axis=1) #Verificamos el tipo de CV solicitado y ejecutamos el CV correspondiente, #CV hay q ejecutarlo 3 veces, uno para predicciones de 1R, otro para predicciones 2R y otro para 2R_1R #Se hace kfold if options.kf == True: validation_k = int(options.kfolds) #Se aplica cv para predecir 1r respuestas, fold_error_t, fold_error_v, final_error_t, final_error_v = crossValidation.k_fold_cross_validation(validation_k, test_percentage, datos1r, options, "1r") print("Ronda 1") print("FoldAccuracyT", 100 - fold_error_t) print("FoldAccuracyV", 100 -fold_error_v) print("FinalAccuracyT",100 - final_error_t) print("FinalAccuracyV",100 - final_error_v) print("--------------------------------------") respuestas = numpy.asarray(respuestas) archivo = numpy.concatenate((archivo,respuestas[numpy.newaxis, :].T), axis=1) #Se aplica cv para predecir 2r respuestas, fold_error_t, fold_error_v, final_error_t, final_error_v = crossValidation.k_fold_cross_validation(validation_k, test_percentage, datos2r, options, "2r") print("Ronda 2") print("FoldAccuracyT", 100 - fold_error_t) print("FoldAccuracyV", 100 - fold_error_v) print("FinalAccuracyT",100 - final_error_t)