def __init__(self, path_of_images, args):
        """Read image paths from a txt file, pair each path with its class, shuffle the pairs.

        Params
          * path_of_images: txt file listing one image path per line
          * args: argparse namespace, kept on the instance
        """
        self.args = args

        # ================================================================================
        # Load every image path listed in the txt file
        img_paths, self.num_of_imgs = utils_common.return_path_list_from_txt(path_of_images)

        # ================================================================================
        # Build (image path, class) pairs and shuffle them in place
        self.zipped_one = utils_data_process.create_img_path_and_class_pairs(
            paths_image=img_paths)
        shuffle(self.zipped_one)
# Example #2
# 0
def detect_img_less_than_256(txt_file_containing_paths_of_img):
    """
    Check whether any image listed in the txt file has a side shorter than 256 px.

    Params
      * txt_file_containing_paths_of_img: txt file holding one image path per line,
        e.g. "/mnt/1T-5e7/image/whole_dataset/text_for_colab/real/temp/bigtime_trn.txt"

    Return
      * have_small_img: 0 means no image has a side < 256
      * small_lens: e.g. [199,200] means there are images with side lengths 199 or 200
    """
    path_list, num_img = utils_common.return_path_list_from_txt(
        txt_file_containing_paths_of_img)

    # Gather (height, width) of every image, plus path/size pairs for reference
    size_tuples = []
    path_size_pairs = []
    for img_path in path_list:
        img_path = img_path.replace("\n", "")
        loaded_img = utils_image.load_img(img_path)

        hw = loaded_img.shape[:2]
        size_tuples.append(hw)
        path_size_pairs.append([img_path, hw])

    # Deduplicate sizes before testing them
    unique_sizes = np.array(list(set(size_tuples)))

    # Count how many side lengths fall below 256
    have_small_img = np.sum(unique_sizes < 256)

    # The offending side lengths themselves
    small_lens = unique_sizes[unique_sizes < 256]

    return have_small_img, small_lens
# Example #3
# 0
def calculate(dir_Docs,dir_Vocab,args):
  """
  Count, per vocab file and per document, how many times vocab words occur.

  Documents are cleaned (punctuation stripped, stopwords removed, words
  lemmatized) and vocab entries are lemmatized, so both sides are compared
  in the same lemma space.

  Params
    * dir_Docs: directory containing the document txt files
    * dir_Vocab: directory containing the vocab txt files
    * args: argparse namespace (not used in this function)

  Return
    * result: list of [vocab_file_name, doc_number, count] entries, e.g.
      [['side_effect', 1, 0], ['side_effect', 2, 3], ['symptom', 1, 2], ...]
  """
  dir_Docs_files=utils_common.get_file_list(dir_Docs)
  dir_Vocab_files=utils_common.get_file_list(dir_Vocab)

  # ================================================================================
  stop_words=set(stopwords.words('english'))
  wnl=WordNetLemmatizer()

  # ================================================================================
  start=timeit.default_timer()

  # ================================================================================
  # Preprocess every document into one cleaned, lemmatized string
  all_docs=[]
  tokenizer=RegexpTokenizer(r'\w+')  # loop-invariant, hoisted out of the loop
  for one_doc in dir_Docs_files:
    doc_contents,num_lines=utils_common.return_path_list_from_txt(one_doc)
    doc_contents=[one_line.replace("\n","") for one_line in doc_contents]
    # e.g. ['I have weight loss.', "I'm thirsty.", 'What is the problem?']

    doc_str=" ".join(doc_contents)
    # e.g. I have weight loss. I'm thirsty. What is the problem?

    # Strip punctuation, then tokenize into words
    doc_str_wo_punctu=" ".join(tokenizer.tokenize(doc_str))
    word_tokens=word_tokenize(doc_str_wo_punctu)
    # e.g. ['I', 'have', 'weight', 'loss', 'I', 'm', 'thirsty', ...]

    # Drop stopwords
    kept=[w for w in word_tokens if w not in stop_words]
    # e.g. ['I', 'weight', 'loss', 'I', 'thirsty', 'What', 'problem']

    # Lemmatize so the doc text matches the lemmatized vocab below.
    # (The original computed this lemmatized string but then counted against
    # the unlemmatized text, so vocab entries whose lemma differs from the
    # surface form could never match.)
    sent_after_lemma=" ".join(wnl.lemmatize(w) for w in kept)

    all_docs.append(sent_after_lemma)

  # ================================================================================
  # Count vocab occurrences per document.
  # NOTE(review): str.count is a plain substring match, so e.g. "pee" would
  # also match inside "speed"; word-boundary matching may be the real intent
  # — confirm before tightening.
  vocab_files=["side_effect","symptom"]
  result=[]
  for vocab_file_idx,one_vocab_file in enumerate(dir_Vocab_files):
    one_vocab_contents,num_vocab=utils_common.return_path_list_from_txt(one_vocab_file)
    # e.g. ['vomit\n', 'dizzy\n', 'diarrhea\n', 'fever\n', 'stomach gas']

    # Strip trailing newlines, then lemmatize each vocab entry
    one_vocab_contents=[wnl.lemmatize(one_word.replace("\n","")) for one_word in one_vocab_contents]

    # ================================================================================
    for doc_idx,one_doc in enumerate(all_docs):
      cnt=sum(one_doc.count(one_word) for one_word in one_vocab_contents)
      result.append([vocab_files[vocab_file_idx],doc_idx+1,cnt])

  # e.g. [['side_effect', 1, 0], ['side_effect', 2, 3], ['side_effect', 3, 0],
  #       ['symptom', 1, 2], ['symptom', 2, 0], ['symptom', 3, 2]]

  stop=timeit.default_timer()
  took_time_sec=stop-start
  took_time_min=str(datetime.timedelta(seconds=took_time_sec))
  # print("took_time_min",took_time_min)

  return result
# Example #4
# 0
def color_word(dir_Docs,dir_Vocab,args):
  """
  Wrap vocab words found in each document with colored <font> HTML tags.

  Words from the first vocab file ("side_effect") are colored red, words
  from the second vocab file ("symptom") are colored blue. Each document is
  first cleaned (punctuation stripped, stopwords removed); vocab entries are
  lemmatized before matching.

  Params
    * dir_Docs: directory containing the document txt files
    * dir_Vocab: directory containing the vocab txt files
    * args: argparse namespace (not used in this function)

  Return
    * (all_docs_colored, all_docs_blue_colored): two lists of HTML strings,
      one entry per document, each ending with "<br>", e.g.
      'I become <font color="red">dizzy</font> ... side effect<br>'

  Fixes vs. original: the function used to fall off the end into a bare
  `afaf` (guaranteed NameError) and never returned the colored lists.
  """
  dir_Docs_files=utils_common.get_file_list(dir_Docs)
  dir_Vocab_files=utils_common.get_file_list(dir_Vocab)

  # ================================================================================
  stop_words=set(stopwords.words('english'))
  wnl=WordNetLemmatizer()

  # ================================================================================
  start=timeit.default_timer()

  # ================================================================================
  # Preprocess every document into one cleaned string
  all_docs=[]
  tokenizer=RegexpTokenizer(r'\w+')  # loop-invariant, hoisted out of the loop
  for one_doc in dir_Docs_files:
    doc_contents,num_lines=utils_common.return_path_list_from_txt(one_doc)
    doc_contents=[one_line.replace("\n","") for one_line in doc_contents]
    # e.g. ['I have weight loss.', "I'm thirsty.", 'What is the problem?']

    doc_str=" ".join(doc_contents)
    # e.g. I have weight loss. I'm thirsty. What is the problem?

    # Strip punctuation, tokenize, drop stopwords
    doc_str_wo_punctu=" ".join(tokenizer.tokenize(doc_str))
    word_tokens=word_tokenize(doc_str_wo_punctu)
    kept=[w for w in word_tokens if w not in stop_words]
    # e.g. ['I', 'weight', 'loss', 'I', 'thirsty', 'What', 'problem']

    all_docs.append(" ".join(kept))

  # ================================================================================
  # Load each vocab file: strip trailing newlines, lemmatize every entry
  vocab_after_proc=[]
  for one_vocab_file in dir_Vocab_files:
    one_vocab_contents,num_vocab=utils_common.return_path_list_from_txt(one_vocab_file)
    # e.g. ['vomit\n', 'dizzy\n', 'diarrhea\n', 'fever\n', 'stomach gas']
    one_vocab_contents=[wnl.lemmatize(w.replace("\n","")) for w in one_vocab_contents]
    vocab_after_proc.append(one_vocab_contents)
  # e.g. [['vomit', 'dizzy', ...], ['weight loss', 'pee', 'thirsty', ...]]

  # ================================================================================
  def _colorize(doc,vocab_words,color):
    # Replace every occurrence of each vocab word with a colored <font> tag.
    # (Replaces the original's single-element-list pop/append juggling.)
    for word in vocab_words:
      doc=doc.replace(word,'<font color="'+color+'">'+word+'</font>')
    return doc

  # Vocab file 0 -> red, vocab file 1 -> blue; append a <br> per document
  all_docs_colored=[_colorize(d,vocab_after_proc[0],"red")+"<br>" for d in all_docs]
  all_docs_blue_colored=[_colorize(d,vocab_after_proc[1],"blue")+"<br>" for d in all_docs]
  # e.g. ['I weight loss I thirsty What problem<br>',
  #       'I become <font color="red">dizzy</font> ... side effect<br>', ...]

  return all_docs_colored,all_docs_blue_colored
def train(args):
  k_fold=3
  epoch=int(args.epoch)
  batch_size=int(args.batch_size)
  # print("epoch",epoch)
  # print("batch_size",batch_size)
  # 9
  # 2
  
  # ================================================================================
  xgb=XGBClassifier(n_estimators=100)

  # ================================================================================
  text_file_instance=text_file_path_api_module.Path_Of_Text_Files(args)

  txt_of_train_data=text_file_instance.train_data
  # print("txt_of_train_data",txt_of_train_data)
  # /mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train_csv_path.txt

  # ================================================================================
  contents_of_txt,num_line=utils_common.return_path_list_from_txt(txt_of_train_data)
  # print("contents_of_txt",contents_of_txt)
  # ['/mnt/1T-5e7/mycodehtml/prac_data_science/kaggle/hivprogression/My_code/Data/training_data.csv']
  
  # ================================================================================

  train_data_wo_id_df=utils_data.load_HIV_csv_data(contents_of_txt[0])
  # print(train_data_wo_id_df.columns)
  # Index(['Resp', 'PR Seq', 'RT Seq', 'VL-t0', 'CD4-t0'], dtype='object')

  # print("train_data_wo_id_df",train_data_wo_id_df.shape)
  # (920, 5)

  # ================================================================================
  # freq_of_char_B,freq_of_char_H=utils_data.count_num_seq_containing_B_or_H(train_data_wo_id_df)
  # print("freq_of_char_B",freq_of_char_B)
  # print("freq_of_char_H",freq_of_char_H)
  # 1
  # 3

  # ================================================================================
  # B_mask_idx,H_mask_idx=utils_data.get_indices_containing_B_or_H(train_data_wo_id_df)
  # print("B_mask_idx",B_mask_idx)
  # print("H_mask_idx",H_mask_idx)
  # [25]
  # [43, 199, 843]

  # ================================================================================
  # train_data_wo_id_df=train_data_wo_id_df.drop(train_data_wo_id_df.index[[25,43,199,843]])
  # print("train_data_wo_id_df",train_data_wo_id_df.shape)
  # (916, 5)

  train_data_wo_id_df=train_data_wo_id_df.iloc[:-1,:]
 
  # ================================================================================
  # print("train_data_wo_id_df.columns",train_data_wo_id_df.columns)
  # Index(['Resp', 'PR Seq', 'RT Seq', 'VL-t0', 'CD4-t0'], dtype='object')

  # ================================================================================
  # @ Length match in DNA sequence string

  # PR_Seq_old=train_data_wo_id_df.iloc[:,1]
  # RT_Seq_old=train_data_wo_id_df.iloc[:,2]

  # PR_Seq=utils_data.length_match_for_PR_Seq(PR_Seq_old)
  # # print("PR_Seq",PR_Seq)
  # # ['CCTCAAATCACTCTTTGGCAACGACCCCTCGTCCCAATAAGGATAGGGGGGCAACTAAAGGAAGCYCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGACATGGAGTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAARACAGTATGATCAGRTACCCATAGAAATCTATGGACATAAAGCTGTAGGTACAGTATTAATAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGCTTGGTTGCACTTTAAATTTY', 

  # RT_Seq=utils_data.length_match_for_RT_Seq(RT_Seq_old)
  # # print("RT_Seq",RT_Seq)
  # # ['CCCATTAGTCCTATTGAAACTGTACCAGTAAAGCTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGYACAGAAATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATATAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTACATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAAYTAGGAATACCACATCCCGCWGGGTTAAAAAAGAAYAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTCTCAGTTCCMTTAGATAAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACGAAATCCAGACATAGTTATCTACCAATACATGGATGATTTGTATGTAGGATCTGATTTRGAAATAGAACAGCATAGAACAAAAATAGAGGAACTGAGACAACATCTGTCAAGGTGGGGGTTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGCTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTTCTGCCAGAAAAAGATAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAGTTGAATTGGGCAAGTCAGATTTAYGCAGGGATTAAAGTAAAGCAATTATGTAAACTCCTTAGGGGGACCAAGKCACTAACAGAAATAATACCACTAACAAGAGAAGCAGAGCTAGAACTGGCAGAAAACAGGGAAATTCTAAAAGAACCAGTACATGGAGTGTATTATGATCCAACAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGC000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000', 

  # ================================================================================
  # @ Replace old with new

  # train_data_wo_id_df.iloc[:,1]=0
  # train_data_wo_id_df.iloc[:,1]=PR_Seq

  # train_data_wo_id_df.iloc[:,2]=0
  # train_data_wo_id_df.iloc[:,2]=RT_Seq

  # print("train_data_wo_id_df",train_data_wo_id_df.head(2))
  #    Resp  \
  # 0  0      
  # 1  0      

  #                                                                                                                                                                                                                                                                                                       PR Seq  \
  # 0  CCTCAAATCACTCTTTGGCAACGACCCCTCGTCCCAATAAGGATAGGGGGGCAACTAAAGGAAGCYCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGACATGGAGTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAARACAGTATGATCAGRTACCCATAGAAATCTATGGACATAAAGCTGTAGGTACAGTATTAATAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGCTTGGTTGCACTTTAAATTTY   
  # 1  CCTCAAATCACTCTTTGGCAACGACCCCTCGTCGCAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGACATGGAATTGCCAGGAAGATGGAAACCAAAAATAATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACCCATAGAAATCTGTGGACATAAAGTTATAAGTACAGTATTAATAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGATGACTCAGCTTGGTTGCACTTTAAATTTT   

  #                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       RT Seq  \
  # 0  CCCATTAGTCCTATTGAAACTGTACCAGTAAAGCTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGYACAGAAATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATATAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTACATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAAYTAGGAATACCACATCCCGCWGGGTTAAAAAAGAAYAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTCTCAGTTCCMTTAGATAAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACGAAATCCAGACATAGTTATCTACCAATACATGGATGATTTGTATGTAGGATCTGATTTRGAAATAGAACAGCATAGAACAAAAATAGAGGAACTGAGACAACATCTGTCAAGGTGGGGGTTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGCTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTTCTGCCAGAAAAAGATAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAGTTGAATTGGGCAAGTCAGATTTAYGCAGGGATTAAAGTAAAGCAATTATGTAAACTCCTTAGGGGGACCAAGKCACTAACAGAAATAATACCACTAACAAGAGAAGCAGAGCTAGAACTGGCAGAAAACAGGGAAATTCTAAAAGAACCAGTACATGGAGTGTATTATGATCCAACAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGC000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000   
  # 1  CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGARATGGAAARGGARGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCGGTATTTGTCATAAAGAAAAAGGACAGTACTAAGTGGAGAAAAGTAGTAGATTTCAGAGAACTTAATAAAAGAACTCAAGACTTCTGGGAAGTTCAATTAGGGATACCACATCCCGCAGGGWTAAAAAAGAATAAATCAGTAACAGTATTGGATGTGGGTGATGCATACTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCCAGTACAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATTTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACGTGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAACTGAGACAACATCTGCTGAAGTGGGGATTGACCACACCAGACAAAAAAYATCAGAAAGAACCTCCATTTCGTTGGATGGGTTATGAACTCCATCCTGATAAMTGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGAAAATTAAATTGGGCAAGCCAGATTTACGCAGGGATTAAAGTAAAGCAATTATGTAAACTCCTTAGGGGAACCAAGGCACTAACWGATGTAATACCACTAACAAGAGAAGCAGAGCTAGAACTG000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000   

  #   VL-t0  CD4-t0  
  # 0  4.3    145     
  # 1  3.6    224     

  # ================================================================================
  train_k,vali_k=utils_data.get_k_folds(train_data_wo_id_df)
  # print("train_k",train_k)
  # [array([[0,
  #       'CCTCAAATCACTCTTTGGCAACGACCCCTCGTCCCAATAAGGATAGGGGGGCAACTAAAGGAAGCYCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGACATGGAGTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAARACAGTATGATCAGRTACCCATAGAAATCTATGGACATAAAGCTGTAGGTACAGTATTAATAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGCTTGGTTGCACTTTAAATTTY',
  #       'CCCATTAGTCCTATTGAAACTGTACCAGTAAAGCTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGYACAGAAATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATATAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTACATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAAYTAGGAATACCACATCCCGCWGGGTTAAAAAAGAAYAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTCTCAGTTCCMTTAGATAAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACGAAATCCAGACATAGTTATCTACCAATACATGGATGATTTGTATGTAGGATCTGATTTRGAAATAGAACAGCATAGAACAAAAATAGAGGAACTGAGACAACATCTGTCAAGGTGGGGGTTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGCTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTTCTGCCAGAAAAAGATAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAGTTGAATTGGGCAAGTCAGATTTAYGCAGGGATTAAAGTAAAGCAATTATGTAAACTCCTTAGGGGGACCAAGKCACTAACAGAAATAATACCACTAACAAGAGAAGCAGAGCTAGAACTGGCAGAAAACAGGGAAATTCTAAAAGAACCAGTACATGGAGTGTATTATGATCCAACAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGC',
  #       4.3, 145],

  # ================================================================================
  # c loss_list: list which will stores loss values to plot loss
  loss_list=[]
  f1_score_list=[]

  # ================================================================================
  if args.task_mode=="train": # If you're in train mode
    
    # ================================================================================
    # Iterate K folds    
    for one_k in range(k_fold):
      single_train_k=train_k[one_k]

      # ================================================================================
      single_trn_data_k=single_train_k[:,1:]
      # print("single_trn_data_k",single_trn_data_k)
      # [['CCTCAAATCACTCTTTGGCAACGACCCCTCGTCCCAATAAGGATAGGGGGGCAACTAAAGGAAGCYCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGACATGGAGTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAARACAGTATGATCAGRTACCCATAGAAATCTATGGACATAAAGCTGTAGGTACAGTATTAATAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGCTTGGTTGCACTTTAAATTTY'
      #   'CCCATTAGTCCTATTGAAACTGTACCAGTAAAGCTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGYACAGAAATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATATAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTACATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAAYTAGGAATACCACATCCCGCWGGGTTAAAAAAGAAYAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTCTCAGTTCCMTTAGATAAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACGAAATCCAGACATAGTTATCTACCAATACATGGATGATTTGTATGTAGGATCTGATTTRGAAATAGAACAGCATAGAACAAAAATAGAGGAACTGAGACAACATCTGTCAAGGTGGGGGTTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGCTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTTCTGCCAGAAAAAGATAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAGTTGAATTGGGCAAGTCAGATTTAYGCAGGGATTAAAGTAAAGCAATTATGTAAACTCCTTAGGGGGACCAAGKCACTAACAGAAATAATACCACTAACAAGAGAAGCAGAGCTAGAACTGGCAGAAAACAGGGAAATTCTAAAAGAACCAGTACATGGAGTGTATTATGATCCAACAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGC'
      #   4.3 145]
      
      single_trn_lbl_k=single_train_k[:,0]
      # print("single_trn_lbl_k",single_trn_lbl_k)
      # [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

      # ================================================================================
      PR_Seq=single_trn_data_k[:,0]
      # print("PR_Seq",PR_Seq)
      # ['CCTCAAATCACTCTTTGGCAACGACCCCTCGTCCCAATAAGGATAGGGGGGCAACTAAAGGAAGCYCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGACATGGAGTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAARACAGTATGATCAGRTACCCATAGAAATCTATGGACATAAAGCTGTAGGTACAGTATTAATAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGCTTGGTTGCACTTTAAATTTY'

      PR_Seq_converted=utils_data.process_PR_Seq(PR_Seq)
      # print("PR_Seq_converted",PR_Seq_converted)
      # [[3, 3, 20, 3, 1, 1, 1, 20, 3, 1, 3, 20, 3, 20, 20, 20, 7, 7, 3, 1, 1, 3, 7, 1, 3, 3, 3, 3, 20, 3, 7, 20, 3, 3, 3, 1, 1, 20, 1, 1, 7, 7, 1, 20, 1, 7, 7, 7, 7, 7, 7, 3, 1, 1, 3, 20, 1, 1, 1, 7, 

      PR_Seq_converted_df=pd.DataFrame(PR_Seq_converted)
      # print("PR_Seq_converted_df",PR_Seq_converted_df.shape)
      # (612, 297)

      PR_Seq_converted_df=PR_Seq_converted_df.fillna(value=0)
      # print("PR_Seq_converted_df",PR_Seq_converted_df)
      # print("PR_Seq_converted_df",PR_Seq_converted_df.shape)
      # (612, 297)

      PR_Seq_converted_np=np.array(PR_Seq_converted_df)
      # print("PR_Seq_converted_np",PR_Seq_converted_np)
      
      lack=320-PR_Seq_converted_np.shape[1]

      PR_Seq_converted_np=np.pad(PR_Seq_converted_np,((0,0),(0,lack)),'constant')
      # print("PR_Seq_converted_np",PR_Seq_converted_np)
      # print("PR_Seq_converted_np",PR_Seq_converted_np.shape)
      # (612, 320)

      # ================================================================================
      RT_Seq=single_trn_data_k[:,1]

      RT_Seq_converted=utils_data.process_RT_Seq(RT_Seq)
      # print("RT_Seq_converted",RT_Seq_converted)
      # [[3, 3, 3, 1, 20, 20, 1, 7, 20, 3, 3, 20, 1, 20, 20, 7, 1, 1, 1, 3, 20, 7, 20, 1, 3, 3, 1, 7, 20, 1, 1, 1, 7, 3, 20, 1, 1, 1, 7, 3, 3, 1, 7, 7, 1, 1, 20, 7, 7, 1, 20, 7, 7, 3, 3, 3, 1, 1, 1, 1,

      RT_Seq_converted_df=pd.DataFrame(RT_Seq_converted)
      # print("RT_Seq_converted_df",RT_Seq_converted_df.shape)
      # (612, 1479)

      RT_Seq_converted_df=RT_Seq_converted_df.fillna(value=0)
      # print("RT_Seq_converted_df",RT_Seq_converted_df)
      # print("RT_Seq_converted_df",RT_Seq_converted_df.shape)
      # (612, 1479)

      RT_Seq_converted_np=np.array(RT_Seq_converted_df)
      # print("RT_Seq_converted_np",RT_Seq_converted_np)
      
      lack=1600-RT_Seq_converted_np.shape[1]

      RT_Seq_converted_np=np.pad(RT_Seq_converted_np,((0,0),(0,lack)),'constant')

      # ================================================================================
      VL=single_trn_data_k[:,2]
      # norm_VL=utils_common.normalize_1D_arr(VL)
      # print("VL",VL)
      # [4.3 5.7 3.5 3.4 4.4 4.1 5.9 4.6 3.3 4.7 4.1 5.2 3.4 3.5 4.0 3.7 3.5 5.7

      # print("norm_VL",norm_VL.shape)
      # (613,)

      VL=VL[:,np.newaxis]
      VL_np=np.array(VL)
      # print("VL_np",VL_np)
      # [[4.3]
      #  [5.7]
      # print("VL_np",VL_np.shape)
      # (612, 1)
      
      # ================================================================================
      CD4=single_trn_data_k[:,3]
      # print("CD4",CD4)
      # [145 206 572 221 384 184 199 247 155 115 414 242 349 308 325 316 117 109
      
      # plt.subplot(1,2,1)
      # plt.plot(CD4)
      # plt.title('Before normalizing CD4')

      # norm_CD4=utils_common.normalize_1D_arr(CD4)
      # plt.subplot(1,2,2)
      # plt.plot(norm_CD4)
      # plt.title('After normalizing CD4')
      # plt.show()
      
      # print("norm_CD4",norm_CD4.shape)
      # (613,)

      CD4=CD4[:,np.newaxis]
      # print("CD4",CD4)

      CD4_np=np.array(CD4).astype("float16")
      # print("CD4_np",CD4_np)
      # [[145]
      #  [206]
      # print("CD4_np",CD4_np.shape)
      # (612, 1)

      # ================================================================================
      # print("PR_Seq_converted_np",PR_Seq_converted_np.shape)
      # # (612, 320)
      # print("RT_Seq_converted_np",RT_Seq_converted_np.shape)
      # # (612, 1600)
      # print("VL_npVL",VL_np.shape)
      # # (612, 1)
      # print("CD4_np",CD4_np.shape)
      # # (612, 1)

      # print("PR_Seq_converted_np",PR_Seq_converted_np[0,:])
      # # (612, 320)
      # print("RT_Seq_converted_np",RT_Seq_converted_np[0,:])
      # # (612, 1600)
      # print("VL_npVL",VL_np[0,0])
      # # (612, 1)
      # print("CD4_np",CD4_np[0,0])
      # # (612, 1)

      # ================================================================================
      final_trn_data=np.hstack((PR_Seq_converted_np,RT_Seq_converted_np,VL_np,CD4_np))
      # print("final_trn_data",final_trn_data.shape)
      # (612, 1922)

      single_trn_lbl_k_np=np.array(single_trn_lbl_k).astype('float16')
      # print("single_trn_lbl_k_np",single_trn_lbl_k_np.shape)
      # (612,)

      xgb.fit(final_trn_data,single_trn_lbl_k_np)

    # ================================================================================
    # End of training  

    # ================================================================================
    one_dummy_data_for_test=utils_data.get_one_dummy_data_for_test()
    # print("one_dummy_data_for_test",one_dummy_data_for_test.shape)

    aa=xgb.predict([one_dummy_data_for_test])
    # print("aa",aa)
    # [0.]

    for one_k in range(k_fold):
      single_vali_k=vali_k[one_k]

      single_vali_data_k=single_vali_k[:,1:]
      single_vali_lbl_k=single_vali_k[:,0]
      single_vali_lbl_k_np=np.array(single_vali_lbl_k).astype("float16")

      # ================================================================================
      # print("single_vali_data_k",single_vali_data_k.shape)
      # (305, 4)
      PR_Seq=single_vali_data_k[:,0]
      RT_Seq=single_vali_data_k[:,1]
      VL=single_vali_data_k[:,2]
      CD4=single_vali_data_k[:,3]

      # ================================================================================
      PR_Seq_converted=utils_data.process_PR_Seq(PR_Seq)
      # print("PR_Seq_converted",PR_Seq_converted)
      # [[3, 3, 20, 3, 1, 1, 1, 20, 3, 1, 3, 20, 3, 20, 20, 20, 7, 7, 3, 1, 1, 3, 7, 1, 3, 3, 3, 3, 20, 3, 7, 20, 3, 7, 3, 1, 1, 20, 1, 1, 1, 7, 1, 20, 1, 7, 7, 7, 7, 7, 7, 3, 1, 1, 3,

      PR_Seq_converted_df=pd.DataFrame(PR_Seq_converted)
      # print("PR_Seq_converted_df",PR_Seq_converted_df.shape)
      # (307, 297)

      PR_Seq_converted_df=PR_Seq_converted_df.fillna(value=0)
      # print("PR_Seq_converted_df",PR_Seq_converted_df)
      # print("PR_Seq_converted_df",PR_Seq_converted_df.shape)
      # (307, 297)

      PR_Seq_converted_np=np.array(PR_Seq_converted_df)
      # print("PR_Seq_converted_np",PR_Seq_converted_np)
      
      lack=320-PR_Seq_converted_np.shape[1]

      PR_Seq_converted_np=np.pad(PR_Seq_converted_np,((0,0),(0,lack)),'constant')
      # print("PR_Seq_converted_np",PR_Seq_converted_np)
      # print("PR_Seq_converted_np",PR_Seq_converted_np.shape)
      # (307, 320)

      # ================================================================================
      RT_Seq_converted=utils_data.process_RT_Seq(RT_Seq)
      # print("RT_Seq_converted",RT_Seq_converted)
      # [[3, 3, 3, 1, 20, 20, 1, 7, 20, 3, 3, 20, 1, 20, 20, 7, 1, 1, 1, 3, 20, 7, 20, 1, 3, 3, 1, 7, 20, 1, 1, 1, 7, 3, 20, 1, 1, 1, 7, 3, 3, 1, 7, 7, 1, 1, 20, 7, 7, 1, 20, 7, 7, 3, 3, 3, 1, 1, 1, 1,

      RT_Seq_converted_df=pd.DataFrame(RT_Seq_converted)
      # print("RT_Seq_converted_df",RT_Seq_converted_df.shape)
      # (612, 1479)

      RT_Seq_converted_df=RT_Seq_converted_df.fillna(value=0)
      # print("RT_Seq_converted_df",RT_Seq_converted_df)
      # print("RT_Seq_converted_df",RT_Seq_converted_df.shape)
      # (612, 1479)

      RT_Seq_converted_np=np.array(RT_Seq_converted_df)
      # print("RT_Seq_converted_np",RT_Seq_converted_np)
      
      lack=1600-RT_Seq_converted_np.shape[1]

      RT_Seq_converted_np=np.pad(RT_Seq_converted_np,((0,0),(0,lack)),'constant')

      # ================================================================================
      VL=VL[:,np.newaxis]
      VL_np=np.array(VL)
      # print("VL_np",VL_np)
      # [[4.3]
      #  [5.7]
      # print("VL_np",VL_np.shape)
      # (612, 1)
      
      # ================================================================================
      CD4=CD4[:,np.newaxis]
      # print("CD4",CD4)

      CD4_np=np.array(CD4).astype("float16")
      # print("CD4_np",CD4_np)
      # [[145]
      #  [206]
      # print("CD4_np",CD4_np.shape)
      # (612, 1)

      # ================================================================================
      final_vali_data=np.hstack((PR_Seq_converted_np,RT_Seq_converted_np,VL_np,CD4_np))
      final_vali_pred=xgb.predict(final_vali_data)
      # print("final_vali_pred",final_vali_pred)
      # [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

      # print("single_vali_lbl_k_np",single_vali_lbl_k_np)
      # [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

      b_c_mat=confusion_matrix(single_vali_lbl_k_np,final_vali_pred,labels=[0,1])
      print("b_c_mat",b_c_mat)
      # [[249   0]
      #  [ 39  19]]

      # True Positive (Tumor pic is predicted as tumor)      False Negative (Tumor pic is predicted as non-tumor)
      # False Positive (Non-tumor pic is predicted as tumor) True Negative (Non-tumor pic is predicted as non-tumor)

      # ================================================================================
      report=classification_report(single_vali_lbl_k_np,final_vali_pred,target_names=['class Non tumor (neg)', 'class Tumor (pos)'])
      print("report",report)
      #                        precision    recall  f1-score   support

      # class Non tumor (neg)       0.86      1.00      0.93       249
      #     class Tumor (pos)       1.00      0.33      0.49        58

      #             micro avg       0.87      0.87      0.87       307
      #             macro avg       0.93      0.66      0.71       307
      #          weighted avg       0.89      0.87      0.85       307

      # ================================================================================
      print("accuracy_score",accuracy_score(single_vali_lbl_k_np,final_vali_pred))
      # # 0.8729641693811075

      print("precision_score",precision_score(single_vali_lbl_k_np,final_vali_pred))
      # # 1.0

      print("recall_score",recall_score(single_vali_lbl_k_np,final_vali_pred))
      # # 0.3275862068965517

      # print("fbeta_score",fbeta_score(y_true, y_pred, beta))
      
      print("f1_score",fbeta_score(single_vali_lbl_k_np,final_vali_pred,beta=1))
      # # 0.49350649350649356

      # ================================================================================
      # @ ROC curve
      fpr,tpr,thresholds=roc_curve(single_vali_lbl_k_np,final_vali_pred)
      plt.plot(fpr,tpr,'o-',label="Binary classification")
      plt.title('Receiver Operating Characteristic')
      plt.show()
Example #6
0
def get_k_folds(txt_of_image_data, txt_of_label_data):
    """Split protein-image paths and their labels into k train/validation folds.

    Params
      * txt_of_image_data
      Path to a text file listing one image path per line.
      * txt_of_label_data
      Path to a CSV file containing "Id" and "Target" columns.

    Return
      * train_k: list (one entry per fold) of arrays of chunked train image paths
      * vali_k: list (one entry per fold) of arrays of chunked validation image paths
      * train_lbl_k: list (one entry per fold) of [Id, Target] label arrays
      * vali_lbl_k: list (one entry per fold) of [Id, Target] label arrays
    """
    path_of_imgs, num_loaded_imgs = utils_common.return_path_list_from_txt(
        txt_of_image_data)

    # Strip trailing newlines so downstream image loading gets clean paths
    path_of_imgs = [one_path.replace("\n", "") for one_path in path_of_imgs]

    # Group the 4 channel files (blue/green/red/yellow) of each sample together
    path_of_imgs_chunked = utils_common.chunk_proteins_by_4C(path_of_imgs)

    # train_index_set / validation_index_set: one index array per fold
    train_index_set, validation_index_set = utils_common.split_by_k_folds(
        path_of_imgs_chunked)

    # ================================================================================
    # Build the path array once (previously rebuilt for every fold) and index it
    # per fold. Generalized: supports any number of folds that split_by_k_folds
    # returns, instead of the previous hard-coded 3.
    path_of_imgs_chunked_np = np.array(path_of_imgs_chunked)

    train_k = [path_of_imgs_chunked_np[idx] for idx in train_index_set]
    vali_k = [path_of_imgs_chunked_np[idx] for idx in validation_index_set]

    # ================================================================================
    loaded_label_data = pd.read_csv(txt_of_label_data, encoding='utf8')

    # Sort by Id so label rows line up with the (sorted) image path list
    loaded_label_data_sorted = loaded_label_data.sort_values(by=["Id"],
                                                             ascending=True)

    # c loaded_label_data_sorted_np: labels as an array of [Id, Target] rows
    loaded_label_data_sorted_np = np.array(
        loaded_label_data_sorted.iloc[:, :].values.tolist())

    train_lbl_k = [loaded_label_data_sorted_np[idx] for idx in train_index_set]
    vali_lbl_k = [loaded_label_data_sorted_np[idx]
                  for idx in validation_index_set]

    # ================================================================================
    return train_k, vali_k, train_lbl_k, vali_lbl_k
def train(args):
  k_fold=3
  epoch=int(args.epoch)
  batch_size=int(args.batch_size)
  # print("epoch",epoch)
  # print("batch_size",batch_size)
  # 9
  # 2

  # ================================================================================
  text_file_instance=text_file_path_api_module.Path_Of_Text_Files(args)

  txt_of_train_data=text_file_instance.train_data
  # print("txt_of_train_data",txt_of_train_data)
  # /mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train_csv_path.txt

  # ================================================================================
  contents_of_txt,num_line=utils_common.return_path_list_from_txt(txt_of_train_data)
  # print("contents_of_txt",contents_of_txt)
  # ['/mnt/1T-5e7/mycodehtml/prac_data_science/kaggle/hivprogression/My_code/Data/training_data.csv']
  
  # ================================================================================
  train_data_df=pd.read_csv(contents_of_txt[0],encoding='utf8')

  train_data_df=train_data_df.dropna()
  # print("train_data_df",train_data_df.shape)
  # (920, 6)

  train_data_wo_id_df=train_data_df.iloc[:,1:]
  # print("train_data_wo_id_df",train_data_wo_id_df.shape)
  # (920, 5)

  # ================================================================================
  train_k,vali_k=utils_data.get_k_folds(train_data_wo_id_df)

  # ================================================================================
  # c loss_list: list which will stores loss values to plot loss
  loss_list=[]
  f1_score_list=[]

  # ================================================================================
  # c model_api_instance: instance of model API
  model_api_instance=model_api_module.Model_API_class(args)
  # print("model_api_instance",model_api_instance)
  # <src.api_model.model_api_module.Model_API_class object at 0x7fb305557b00>

  # ================================================================================
  # # @ Test Grad CAM
  # imgs=["/mnt/1T-5e7/mycodehtml/bio_health/Kaggle_histopathologic-cancer-detection/Data/train_split/33/82d4d190d2fed1be255fc3bac36a37c860bb31c0.tif",
  #       "/mnt/1T-5e7/mycodehtml/bio_health/Kaggle_histopathologic-cancer-detection/Data/train_split/33/82a5300cd61628fb9bae332cdb7d5e7e37b1fb36.tif"]
  # grad_cam.initialize_grad_cam(model=model_api_instance.gen_net,list_of_img_paths=imgs,args=args)

  # ================================================================================
  if args.task_mode=="train": # If you're in train mode
    
    # ================================================================================
    # @ Configure learning rate scheduler

    # Update learning rate 4 times during entire epochs
    # For example, if you use 10 epochs, int(10/4), 1 2 / 3 4 / 5 6 / 7 8 / 9 10
    # 0-1 epochs: 0.001 -> 2-3 epochs: 0.0001 -> 4-5 epochs: 0.00001 -> 5-6 epochs: 0.000001

    scheduler=StepLR(model_api_instance.optimizer,step_size=int(epoch/4),gamma=0.1)

    # ================================================================================
    for one_k in range(k_fold):
      single_train_k=train_k[one_k]
      single_vali_k=vali_k[one_k]
      single_train_lbl_k=train_lbl_k[one_k]
      single_vali_lbl_k=vali_lbl_k[one_k]

      # ================================================================================
      # @ Validation dataset
      dataset_inst_vali=custom_ds.Custom_DS_vali(single_vali_k,single_vali_lbl_k,args=args)

      dataloader_vali=torch.utils.data.DataLoader(
          dataset=dataset_inst_vali,batch_size=batch_size,shuffle=False,num_workers=3)

      for one_ep in range(epoch): # @ Iterates all epochs
        # print("single_train_k",len(single_train_k))
        # 20714
        # print("single_vali_k",len(single_vali_k))
        # 10358
        # print("single_train_lbl_k",len(single_train_lbl_k))
        # 20714
        # print("single_vali_lbl_k",len(single_vali_lbl_k))
        # 10358

        # ================================================================================
        # c dataset_inst_trn: dataset instance of tumor
        dataset_inst_trn=custom_ds.Custom_DS(single_train_k,single_train_lbl_k,args=args)
        
        # Test iterator
        # iter_dataset_inst_trn=iter(dataset_inst_trn)
        # trn=next(iter_dataset_inst_trn)
        # print("trn",trn)

        # ================================================================================
        # c dataloader_trn: create dataloader
        dataloader_trn=torch.utils.data.DataLoader(
          dataset=dataset_inst_trn,batch_size=batch_size,shuffle=False,num_workers=3)
        
        # # c dataloader_trn_iter: iterator of dataloader
        # dataloader_trn_iter=iter(dataloader_trn)
        # # Test dataloader
        # pairs=next(dataloader_trn_iter)
        # # print("pairs",pairs)

        # ================================================================================
        # c num_imgs_trn: number of train image
        num_imgs_trn=len(dataset_inst_trn)
        # print("num_imgs_trn",num_imgs_trn)
        # 20714

        args.__setattr__("num_imgs_trn",num_imgs_trn)
        # print("args",args)
        
        # ================================================================================
        # print("Current batch size:",batch_size)
        # print("Possible batch size:",list(utils_common.divisorGenerator(num_imgs_trn)))
        # assert str(num_imgs_trn/batch_size).split(".")[-1]==str(0),"Check batch size, currently it's incorrect"

        # ================================================================================
        # @ If you don't use Augmentor
        if args.use_augmentor=="False":
          pass      

        else: # @ If you use Augmentor

          # @ Iterate all images in dataset during single epoch
          for idx,data in enumerate(dataloader_trn):
           
            bs_pa_tumor_d=utils_data.create_batch_pair_of_paths(data,args)
            # print("bs_pa_tumor_d",bs_pa_tumor_d)
            # [[('/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/292d9824-bba1-11e8-b2b9-ac1f6b6435d0_blue.png',
            #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/292d9824-bba1-11e8-b2b9-ac1f6b6435d0_green.png',
            #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/292d9824-bba1-11e8-b2b9-ac1f6b6435d0_red.png',
            #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/292d9824-bba1-11e8-b2b9-ac1f6b6435d0_yellow.png'),
            #   ('/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/7f1e4598-bbc0-11e8-b2bb-ac1f6b6435d0_blue.png',
            #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/7f1e4598-bbc0-11e8-b2bb-ac1f6b6435d0_green.png',
            #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/7f1e4598-bbc0-11e8-b2bb-ac1f6b6435d0_red.png',
            #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/7f1e4598-bbc0-11e8-b2bb-ac1f6b6435d0_yellow.png')],
            #  array(['3','23'],dtype='<U2')]

            # ================================================================================
            # @ Perform data augmentation

            sampled_trn_imgs,label_values=utils_data.use_augmetor_for_data(bs_pa_tumor_d,args)
            # afaf 1: sampled_trn_imgs,label_values=utils_data.use_augmetor_for_data(bs_pa_tumor_d,args)
            
            # print("sampled_trn_imgs",sampled_trn_imgs.shape)
            # (2, 4, 224, 224)
            
            # print("label_values",label_values)
            # [[4], [14]]

            # print("label_values",np.array(label_values).shape)
            # (2, 2)

            # ================================================================================
            oh_label_arr=utils_common.one_hot_label(batch_size,label_values)
            # print("oh_label_arr",oh_label_arr)
            # [[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
            #  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

            # ================================================================================
            trn_imgs_tcv=utils_pytorch.get_Variable(sampled_trn_imgs)
            # print("trn_imgs_tcv",trn_imgs_tcv.shape)
            # torch.Size([2, 4, 224, 224])

            # ================================================================================
            # @ Remove existing gradients
            model_api_instance.remove_existing_gradients_before_starting_new_training()
            
            # ================================================================================
            # @ c predicted_labels: pass input images and get predictions
            predicted_labels=model_api_instance.gen_net(trn_imgs_tcv)
            # print("predicted_labels",predicted_labels)
            # tensor([[-0.2858, -0.7700, -0.0600,  0.3553,  0.0367, -0.4130,  0.3102, -0.2443,
            #          -0.1775, -0.1839,  0.0499, -0.1489, -0.9805,  0.1817, -0.0504,  0.8930,
            #          -0.4017, -0.1899,  0.0937, -0.3465,  0.2830, -0.2755,  0.4233, -0.1301,
            #           1.1688,  0.2110,  0.1423, -0.3933],
            #         [-0.2858, -0.7700, -0.0600,  0.3553,  0.0367, -0.4130,  0.3102, -0.2443,
            #          -0.1775, -0.1839,  0.0499, -0.1489, -0.9805,  0.1817, -0.0504,  0.8930,
            #          -0.4017, -0.1899,  0.0937, -0.3465,  0.2830, -0.2755,  0.4233, -0.1301,
            #           1.1688,  0.2110,  0.1423, -0.3933]], device='cuda:0',grad_fn=<AddmmBackward>)

            label_tc=Variable(torch.tensor(oh_label_arr,device=predicted_labels.device).float())
            # print("label_tc",label_tc)
            # tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
            #         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], 
            #         device='cuda:0',dtype=torch.float16)

            # ================================================================================
            # @ Calculate loss values

            loss_val=loss_functions_module.FocalLoss(predicted_labels,label_tc)
            # print("loss_val",loss_val)
            # tensor(6.5374, device='cuda:0', grad_fn=<MeanBackward1>)
                      
            # ================================================================================
            # @ Calculate gradient values through backpropagation
            loss_val.backward()

            # ================================================================================
            # @ Update parameters of the network based on gradients
            model_api_instance.optimizer.step()

            # ================================================================================
            # @ If you want to print loss
            if args.use_loss_display=="True":
              if idx%int(args.leapping_term_when_displaying_loss)==0:
                print("Epoch:",one_ep,", Batch:",idx)
                print("loss_from_one_batch",loss_val.item())
            
            loss_list.append(loss_val.item())

            # ================================================================================
            # @ Save model after every batch you configure 
            # by using args.leapping_term_when_saving_model_after_batch
            if idx%int(args.leapping_term_when_saving_model_after_batch)==0:
              num_batch="batch_"+str(idx)
              model_api_instance.save_model_after_epoch(num_batch)

            # ================================================================================
            # print("end of single batch")

          # ================================================================================
          # print("end of all batches")

        # ================================================================================
        # @ Save model after epoch
        num_epoch="epoch_"+str(one_ep)
        model_api_instance.save_model_after_epoch(num_epoch)

        # ================================================================================
        # @ Update learning rate

        scheduler.step()
        # print("scheduler.base_lrs",scheduler.base_lrs)

        # ================================================================================
        # print("End of single epoch")

      # ================================================================================
      # print("end of all epochs")
      
      # ================================================================================
      with torch.no_grad():
        n=28
        TP=torch.tensor(np.zeros(n)).float().cuda()
        FP=torch.tensor(np.zeros(n)).float().cuda()
        FN=torch.tensor(np.zeros(n)).float().cuda()

        for idx_vali,data_vali in enumerate(dataloader_vali):
          bs_pa_tumor_d_vali=utils_data.create_batch_pair_of_paths(data_vali,args)
          # print("bs_pa_tumor_d_vali",bs_pa_tumor_d_vali)
          # [[('/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/0020af02-bbba-11e8-b2ba-ac1f6b6435d0_blue.png',
          #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/0020af02-bbba-11e8-b2ba-ac1f6b6435d0_green.png',
          #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/0020af02-bbba-11e8-b2ba-ac1f6b6435d0_red.png',
          #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/0020af02-bbba-11e8-b2ba-ac1f6b6435d0_yellow.png'),
          #   ('/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_blue.png',
          #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_green.png',
          #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_red.png',
          #    '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_yellow.png')],
          #  array(['25 2','16 0'],dtype='<U4')]

          img_paths=bs_pa_tumor_d_vali[0]
          labels=bs_pa_tumor_d_vali[1]
          # print("labels",labels)
          # labels ['2 0' '1']
          # print("labels",labels.shape)

          labels=[one_protein_lbl.strip().split(" ") for one_protein_lbl in labels]
          # [['5'], ['0'], ['25'], ['2'], ['23'], ['25', '4'], ['12'], ['22', '2'], ['3'], ['0', '21'], ['2'], ['25', '18', '3', '0'], ['5'], ['2', '0', '21'], ['0', '21'], ['25'], ['25'], ['23'], ['23', '0'], ['25', '2', '0']]
          # print("labels",labels)

          labels_oh=utils_common.one_hot_label_vali(batch_size,labels)
          # print("labels_oh",labels_oh)
          # [[1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
          #  [0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
          labels_oh_np=np.array(labels_oh)
          labels_oh_tc=torch.tensor(labels_oh_np).cuda()

          all_images_vali_stacked=utils_data.get_batch_vali_imgs(img_paths)
          # print("all_images_vali_stacked",all_images_vali_stacked.shape)
          # (2, 4, 224, 224)

          all_images_vali_stacked_tc=utils_pytorch.get_Variable(all_images_vali_stacked)
          # print("all_images_vali_stacked_tc",all_images_vali_stacked_tc.shape)
          # torch.Size([2, 4, 224, 224])

          model_eval=model_api_instance.gen_net.eval()
          pred_vali=model_eval(all_images_vali_stacked_tc)
          # print("pred_vali",pred_vali)
          # print("pred_vali",pred_vali.shape)
          # torch.Size([2, 28])
      
          # ================================================================================
          single_TP,single_FP,single_FN=metrics_module.calculate_f1_score(pred_vali,labels_oh_tc)
          TP+=single_TP
          FP+=single_FP
          FN+=single_FN

        score=(2.0*TP/(2.0*TP+FP+FN+1e-6)).mean()
        print("score",score)
        f1_score_list.append(score.item())
        # tensor(0.0238, device='cuda:0')

    # ================================================================================
    # @ Plot loss value
    plt.plot(loss_list)
    plt.title("Loss value: 1st fold, 2nd fold, 3rd fold, continuously")
    plt.savefig("loss.png")
    plt.show()

    plt.plot(f1_score_list)
    plt.title("F1 score: 1st fold, 2nd fold, 3rd fold, continuously")
    plt.savefig("f1_score.png")
    plt.show()
  
  # ================================================================================
  elif args.task_mode=="validation":
    with torch.no_grad(): # @ Use network without calculating gradients
      # tumor_trn=args.dir_where_text_file_for_image_paths_is_in+"/tumor_trn.txt"
      # tumor_lbl=args.dir_where_text_file_for_image_paths_is_in+"/train_labels.csv"
      # print("tumor_trn",tumor_trn)
      # print("tumor_lbl",tumor_lbl)
      # /mnt/1T-5e7/mycodehtml/bio_health/Kaggle_histopathologic-cancer-detection/Data/tumor_trn.txt
      # /mnt/1T-5e7/mycodehtml/bio_health/Kaggle_histopathologic-cancer-detection/Data/train_labels.csv

      # ================================================================================
      # @ Dataset and Dataloader

      # @ c dataset_inst_test_tumor: dataset instance of tumor dataset
      dataset_inst_test_tumor=custom_ds.custom_ds(
        txt_containing_paths=tumor_trn,txt_containing_labels=tumor_lbl,is_train=False,args=args)

      # @ c dataloader_tumor_test: dataloader instance of tumor dataset
      dataloader_tumor_test=torch.utils.data.DataLoader(
        dataset=dataset_inst_test_tumor,batch_size=batch_size,shuffle=False,num_workers=3)
      
      # ================================================================================
      # @ c num_imgs_test: number of entire test images
      num_imgs_test=len(dataset_inst_test_tumor)

      # ================================================================================
      # @ Create network and optimizer
      if args.train_method=="train_by_transfer_learning_using_resnet":
        model_api_instance=model_api_module.Model_API_class(args)
      
      # ================================================================================
      predicted_values=[]
      true_values=[]
      img_paths=[]

      # ================================================================================
      # @ Iterate all batches (batch1+batch2+...+batchn=entire images)
      for idx,data in enumerate(dataloader_tumor_test):
        # print("idx",idx)
        # print("data",data)
        # [('/mnt/1T-5e7/mycodehtml/bio_health/Kaggle_histopathologic-cancer-detection/Data/train/e693f9ac4097289c317831960514b78701999cd9.tif\n',
        #   '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle_histopathologic-cancer-detection/Data/train/e6941f6c6825e7c409b9364e2fb6c2d629df8a76.tif\n',),
        #  [('e693f9ac4097289c317831960514b78701999cd9','e6941f6c6825e7c409b9364e2fb6c2d629df8a76'),tensor([1,0])]]
        
        # ================================================================================
        # @ c imgs: paths of validation images
        imgs=data[0]

        img_paths.extend(imgs)

        # @ c imgs: labels to validation images
        lbls=data[1][1].numpy()

        # @ c num_imgs: number of validation image in one batch
        num_imgs=lbls.shape[0]
        # print("num_imgs",num_imgs)
        # 11
        # @ Load images from paths

        # ================================================================================
        test_imgs_list=[]
        for one_img_path in imgs:
          one_loaded_img=utils_image.load_img(one_img_path)
          # print("one_loaded_img",one_loaded_img.shape)
          # (96, 96, 3)

          one_loaded_img=resize(one_loaded_img,(224,224))

          test_imgs_list.append(one_loaded_img)

        # ================================================================================
        test_imgs_np=np.array(test_imgs_list).transpose(0,3,1,2)
        
        # @ If you want to use center (48,48) image from (96,96) image
        # test_imgs_np=test_imgs_np[:,:,24:72,24:72]
        # print("test_imgs_np",test_imgs_np.shape)
        # (11, 3, 48, 48)

        test_imgs_tc=Variable(torch.Tensor(test_imgs_np).cuda())

        # ================================================================================
        # @ Make predictions

        prediction=model_api_instance.gen_net(test_imgs_tc)
        # print("prediction",prediction)
        # tensor([[-2.0675],
        #         [-2.9296],

        sigmoid=torch.nn.Sigmoid()

        prediction_np=sigmoid(prediction).cpu().numpy()

        # ================================================================================
        # @ Make predicted labels

        prediction_np=np.where(prediction_np>0.5,1,0).squeeze()
        # print("prediction_np",prediction_np)
        # [0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0]
        # print("lbls",lbls)
        # [1 0 0 0 0 1 0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 1 1 1 0]

        predicted_values.extend(prediction_np)
        
        true_values.extend(lbls)
      
      # ================================================================================
      y_true=true_values
      y_pred=predicted_values

      # ================================================================================
      # @ Binary Confusion Matrix

      b_c_mat=confusion_matrix(true_values,predicted_values,labels=[0,1])
      # print("b_c_mat",b_c_mat)
      # [[30  2]
      #  [ 0 68]]

      # True Positive (Tumor pic is predicted as tumor)      False Negative (Tumor pic is predicted as non-tumor)
      # False Positive (Non-tumor pic is predicted as tumor) True Negative (Non-tumor pic is predicted as non-tumor)
      
      # ================================================================================
      # @ metric report
      
      report=classification_report(y_true,y_pred,target_names=['class Non tumor (neg)', 'class Tumor (pos)'])
      # print(report)
      #                        precision    recall  f1-score   support

      # class Non tumor (neg)       0.97      1.00      0.99        68
      #     class Tumor (pos)       1.00      0.94      0.97        32

      #             micro avg       0.98      0.98      0.98       100
      #             macro avg       0.99      0.97      0.98       100
      #          weighted avg       0.98      0.98      0.98       100

      # ================================================================================
      print("accuracy_score",accuracy_score(y_true,y_pred))
      # 0.98

      print("precision_score",precision_score(y_true,y_pred))
      # 1.0

      print("recall_score",recall_score(y_true,y_pred))
      # 0.9375

      # print("fbeta_score",fbeta_score(y_true, y_pred, beta))
      
      print("f1_score",fbeta_score(y_true,y_pred,beta=1))
      # 0.967741935483871

      # ================================================================================
      # @ ROC curve
      fpr,tpr,thresholds=roc_curve(y_true,y_pred)
      plt.plot(fpr,tpr,'o-',label="Logistic Regression")
      plt.title('Receiver operating characteristic example')
      plt.show()

  elif args.task_mode=="submission":
    with torch.no_grad(): # @ Use network without calculating gradients
      
      sub_ds=custom_ds_test.custom_ds_Submission()
      print("sub_ds",sub_ds)

      sub_dl=torch.utils.data.DataLoader(
        dataset=sub_ds,batch_size=batch_size,shuffle=False,num_workers=3)
      print("sub_dl",sub_dl)

      # ================================================================================
      # @ c num_imgs_test: number of entire test images

      num_imgs_test=len(sub_ds)

      # ================================================================================
      # @ Create network and optimizer

      if args.train_method=="train_by_transfer_learning_using_resnet":
        model_api_instance=model_api_module.Model_API_class(args)

      # ================================================================================
      label_submission=pd.read_csv("/mnt/1T-5e7/mycodehtml/bio_health/Kaggle_histopathologic-cancer-detection/Data/sample_submission.csv",encoding='utf8')
      base_names=label_submission.iloc[:,0].tolist()
      # print("base_names",base_names)

      # ================================================================================
      predicted_values=[]
      # @ Iterate all batches (batch1+batch2+...+batchn=entire images)
      for idx,data in enumerate(sub_dl):
        # print("idx",idx)
        # print("data",data)
        # 0
        # ['/mnt/1T-5e7/mycodehtml/bio_health/Kaggle_histopathologic-cancer-detection/Data/test/0b2ea2a822ad23fdb1b5dd26653da899fbd2c0d5.tif',
        
        imgs=data

        # ================================================================================
        test_imgs_list=[]
        for one_img_path in imgs:
          one_loaded_img=utils_image.load_img(one_img_path)
          # print("one_loaded_img",one_loaded_img.shape)
          # (96, 96, 3)

          one_loaded_img=resize(one_loaded_img,(224,224))

          test_imgs_list.append(one_loaded_img)

        # ================================================================================
        test_imgs_np=np.array(test_imgs_list).transpose(0,3,1,2)
        
        # @ If you want to use center (48,48) image from (96,96) image
        # test_imgs_np=test_imgs_np[:,:,24:72,24:72]
        # print("test_imgs_np",test_imgs_np.shape)
        # (11, 3, 48, 48)

        test_imgs_tc=Variable(torch.Tensor(test_imgs_np).cuda())
        # print("test_imgs_tc",test_imgs_tc.shape)
        # torch.Size([30, 3, 224, 224])

        # ================================================================================
        # @ Make predictions
        prediction=model_api_instance.gen_net(test_imgs_tc)
        # print("prediction",prediction)
        # tensor([[-2.0675],
        # ...
        #         [-1.2222]], device='cuda:0')

        sigmoid=torch.nn.Sigmoid()

        prediction_np=sigmoid(prediction).cpu().numpy()

        # ================================================================================
        # @ Make predicted labels

        prediction_np=np.where(prediction_np>0.5,1,0).squeeze()
        # print("prediction_np",prediction_np)
        # [0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0]
        # print("lbls",lbls)
        # [1 0 0 0 0 1 0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 1 1 1 0]

        predicted_values.extend(prediction_np)
     
      my_submission=pd.DataFrame({'id': base_names,'label': predicted_values})
      my_submission.to_csv('youngminpar2559_submission.csv',index=False)
Example #8
0
def train(args):
  """Train the CNN text-classification model over the sentiment dataset.

  Params
    * args
    Namespace-like hyperparameter object (must carry `epoch` and
    `batch_size`). This function mutates it by attaching model-shape
    attributes: embed_num, embed_dim, class_num, kernel_sizes, kernel_num,
    save_dir, num_all_sentences.
  """
  vocab_txt_path="/mnt/1T-5e7/Companies/Sakary/Management_by_files/00002_Architecture_specific_projects/00002_Grad_CAM_on_text_classification/My_code/Data/vocab.txt"
  text_data_path="/mnt/1T-5e7/Companies/Sakary/Management_by_files/00002_Architecture_specific_projects/00002_Grad_CAM_on_text_classification/My_code/Data/text_data.csv"

  # ================================================================================
  # @ Load vocabulary: one token per line; strip trailing newlines
  contents,num_line=utils_common.return_path_list_from_txt(vocab_txt_path)
  contents=[one_token.replace("\n","") for one_token in contents]
  # e.g. ['<unk>', '<pad>', '', 'the', ',', 'a', ...], ~21114 tokens

  # ================================================================================
  # @ Attach model-shape hyperparameters to args (plain attribute assignment
  # is the idiomatic equivalent of args.__setattr__)
  args.embed_num=len(contents)
  args.embed_dim=128
  args.class_num=2
  args.kernel_sizes=[3,4,5]
  args.kernel_num=100
  args.save_dir=os.path.join(".",datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

  # ================================================================================
  # @ Build network + optimizer wrapper
  model_API_obj=model_api_module.Model_API_class(args)

  # ================================================================================
  epoch=int(args.epoch)
  batch_size=int(args.batch_size)

  # ================================================================================
  for one_ep in range(epoch): # @ Iterate all epochs
    # Re-create the dataset each epoch so its iterator starts fresh
    custom_ds_obj=custom_ds.Custom_DS(text_data_path,args)

    custom_ds_iter=iter(custom_ds_obj)

    num_all_sentences=len(custom_ds_obj)

    args.num_all_sentences=num_all_sentences

    # Number of full batches per epoch; any remainder sentences are dropped
    num_iteration_for_iter=num_all_sentences//batch_size

    # ================================================================================
    for one_iter in range(num_iteration_for_iter):
      # @ Remove accumulated gradients before each training step
      model_API_obj.remove_existing_gradients_before_starting_new_training()

      # ================================================================================
      # @ Run one training step over one batch
      train_over_sentiment_dataset.train(custom_ds_iter,batch_size,model_API_obj,contents,args)
def visualize_images(args):
    """Visualize the first 3 protein samples of the dataset.

    Each sample is stored as 4 grayscale channel images (blue, green, red,
    yellow). This composes an RGB image from the B/G/R channels and also
    writes each single channel into its own color-coded PNG under ./img_out/.

    Params
      * args
      Unused in the visible body; presumably kept for a uniform task signature.
    """

    # ================================================================================
    # NOTE(review): `train_imgs` is not defined in this function — presumably a
    # module-level path to a txt file listing image paths; confirm it is in scope.
    loaded_path, num_imgs = utils_common.return_path_list_from_txt(train_imgs)
    # print("loaded_path",loaded_path)
    # ['/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_blue.png\n',
    #  '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_green.png\n',

    # print("num_imgs",num_imgs)
    # 124288

    # Group consecutive paths in chunks of 4 (blue, green, red, yellow of one sample).
    # NOTE(review): the stop value int(num_imgs / 4) combined with step 4 only
    # covers the first quarter of the paths; harmless here because only the first
    # 3 chunks are used below, but the stop was likely meant to be num_imgs.
    loaded_path_chunked = []
    for i in range(0, int(num_imgs / 4), 4):
        one_protein = loaded_path[i:i + 4]
        # print("one_protein",one_protein)
        # ['/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_blue.png\n',
        #  '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_green.png\n',
        #  '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_red.png\n',
        #  '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_yellow.png\n']

        loaded_path_chunked.append(one_protein)

    # print("loaded_path_chunked",loaded_path_chunked)
    # [['/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_blue.png\n',
    #   '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_green.png\n',
    #   '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_red.png\n',
    #   '/mnt/1T-5e7/mycodehtml/bio_health/Kaggle/human-protein-atlas-image-classification/Data/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_yellow.png\n'

    # Only visualize the first 3 samples
    path_3_proteins = loaded_path_chunked[:3]

    # ================================================================================
    # Load the 4 channel images of each of the 3 samples
    images_of_3_proteins = []
    for one_protein in path_3_proteins:
        b_img = one_protein[0].replace("\n", "")
        g_img = one_protein[1].replace("\n", "")
        r_img = one_protein[2].replace("\n", "")
        y_img = one_protein[3].replace("\n", "")

        b_img = utils_image.load_img(b_img)
        g_img = utils_image.load_img(g_img)
        r_img = utils_image.load_img(r_img)
        y_img = utils_image.load_img(y_img)

        # print("b_img",b_img.shape)
        # (512, 512)
        # print("g_img",g_img.shape)
        # (512, 512)
        # print("r_img",r_img.shape)
        # (512, 512)
        # print("y_img",y_img.shape)
        # (512, 512)

        images_of_3_proteins.append([b_img, g_img, r_img, y_img])

    # For each sample: place each grayscale channel into a black RGB canvas and
    # save it, plus a composed RGB image from the B/G/R channels.
    i = 0
    for one_protein_img in images_of_3_proteins:
        # Black (all-zero) RGB canvas of the same H x W as the channel images
        bg_img = np.zeros(
            (one_protein_img[0].shape[0], one_protein_img[0].shape[1], 3))
        # print("bg_img",bg_img.shape)
        # (512, 512, 3)

        # One flattened (H*W, 3) copy per channel so each write is independent
        bg_img_flat_for_b = bg_img.reshape(-1, 3).copy()
        bg_img_flat_for_g = bg_img.reshape(-1, 3).copy()
        bg_img_flat_for_r = bg_img.reshape(-1, 3).copy()
        bg_img_flat_for_y = bg_img.reshape(-1, 3).copy()

        # print("one_protein_img[0]",one_protein_img[0].shape)
        # (512, 512)
        # print("one_protein_img[1]",one_protein_img[1].shape)
        # (512, 512)
        # print("one_protein_img[2]",one_protein_img[2].shape)
        # (512, 512)
        # print("one_protein_img[3]",one_protein_img[3].shape)
        # (512, 512)

        # Flatten each channel to match the (H*W, 3) canvases
        b_img = one_protein_img[0].reshape(-1)
        g_img = one_protein_img[1].reshape(-1)
        r_img = one_protein_img[2].reshape(-1)
        y_img = one_protein_img[3].reshape(-1)

        # Stack R, G, B channel planes and move channels last -> (H, W, 3)
        rgb_img = np.stack((one_protein_img[0], one_protein_img[1],
                            one_protein_img[2])).transpose(1, 2, 0)
        # print("rgb_img",rgb_img.shape)

        # ================================================================================
        # NOTE(review): scipy.misc.imsave was deprecated and removed in SciPy 1.2;
        # imageio.imwrite is the recommended replacement.
        import scipy.misc
        scipy.misc.imsave('./img_out/rgb_img_' + str(i) + '.png', rgb_img)

        # Blue channel -> column 2 (B) of the canvas
        bg_img_flat_for_b[:, 2] = b_img
        scipy.misc.imsave('./img_out/b_img_' + str(i) + '.png',
                          bg_img_flat_for_b.reshape(512, 512, 3))

        # Green channel -> column 1 (G)
        bg_img_flat_for_g[:, 1] = g_img
        scipy.misc.imsave('./img_out/g_img_' + str(i) + '.png',
                          bg_img_flat_for_g.reshape(512, 512, 3))

        # Red channel -> column 0 (R)
        bg_img_flat_for_r[:, 0] = r_img
        scipy.misc.imsave('./img_out/r_img_' + str(i) + '.png',
                          bg_img_flat_for_r.reshape(512, 512, 3))

        # NOTE(review): the yellow channel is written into the red column only;
        # rendering it as yellow would require setting both R (0) and G (1)
        # columns — confirm whether red-only display is intended.
        bg_img_flat_for_y[:, 0] = y_img
        scipy.misc.imsave('./img_out/y_img_' + str(i) + '.png',
                          bg_img_flat_for_y.reshape(512, 512, 3))

        i = i + 1