Example #1
 def save(self, s_file=None, s_graph=None, l_remove_loop=False):
     """Export the network as XGMML; the output extension is always .xgmml."""
     if s_file is not None:
         fname = os.path.splitext(s_file)[0]  # drop any user-supplied extension
         s_name = os.path.split(fname)[1]
     else:
         fname = s_graph or self.name or 'Untitled'
         s_name = fname
     ext = '.xgmml'
     s_out = self.print_header(s_name)
     S_node = []
     S_edge = []
     S_REMOVE_NODE = {"label", "id", "canonicalName"}
     S = self.T_node.header()
     S_node.extend([
         s for s in S
         if s not in S_REMOVE_NODE and not s.startswith('graphics')
     ])
     S_REMOVE_EDGE = {
         "label", "Gene_A", "Name_A", "InteractionType", "Gene_B", "Name_B",
         "canonicalName", "TYPE"
     }
     S = self.T_edge.header()
     S_edge.extend([s for s in S if s not in S_REMOVE_EDGE])
     s_out += self.print_xgmml(self.T_node,
                               self.T_edge,
                               "Gene",
                               "Symbol",
                               S_node,
                               S_edge,
                               l_remove_loop=l_remove_loop)
     util.save_list(fname + ext, s_out, s_end='\n')
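Most of these examples pass util.save_list a file path first and a list of strings (or a single string) second, e.g. util.save_list(fname + ext, s_out, s_end='\n'). The project's real helper is not shown on this page; a minimal sketch consistent with those call sites (an assumption inferred from usage, not the actual util) is:

def save_list(s_file, S, s_end=''):
    # Hypothetical sketch: write each entry of S to s_file, appending s_end;
    # a bare string is treated as a one-element list.
    if isinstance(S, str):
        S = [S]
    with open(s_file, 'w') as f:
        for s in S:
            f.write(str(s) + s_end)

Note that other projects below use different conventions (a single dict in Example #2, object-first in Example #8), so each example's util is its own.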
Example #2
def register_a_face():
    if request.method == 'POST':

        name = request.form.get('name')

        f = request.files['image']
        # save the upload and load it back directly; listing the directory and
        # taking img_list[0] could pick up a stale file if several exist
        saved_path = os.path.join(temp_filepath, f.filename)
        f.save(saved_path)

        image = face_recognition.load_image_file(saved_path)

        encodings = face_recognition.face_encodings(image)
        if not encodings:
            return "no face found", 400  # guard: face_encodings returns [] when no face is detected
        encoded_image = encodings[0]

        encoded_image_list = (util.load_list())["faces"]
        encoded_image_list.append({
            'name': name,
            'encoded_image': encoded_image.tolist()
        })
        util.save_list({"faces": encoded_image_list})

        util.flush_files('temp')

        return "registered"

    if request.method == 'GET':
        encoded_image_json = util.load_list()
        return encoded_image_json

    if request.method == 'DELETE':
        util.save_list({'faces': []})
        return 'deleted'
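Unlike Example #1, this project's util.load_list and util.save_list take no path and round-trip a single dict, behaving like JSON persistence for the face database. A hedged sketch of such helpers (the faces.json file name is invented):

import json

FACES_FILE = 'faces.json'  # hypothetical storage location

def load_list():
    # Assumed behavior: return the stored dict, defaulting to an empty face list.
    try:
        with open(FACES_FILE) as f:
            return json.load(f)
    except FileNotFoundError:
        return {"faces": []}

def save_list(obj):
    # Assumed behavior: overwrite the store with obj.
    with open(FACES_FILE, 'w') as f:
        json.dump(obj, f)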
Example #3
 def _fix_missing(s_file):
     'This is no longer needed, fixed cwc on May 2, 2013'
     # 340282346638528859811704183484516925440.0 is the float32 maximum, apparently
     # written as a missing-value sentinel; blank it out wherever it follows whitespace.
     lines = []
     with open(s_file) as f:
         for line in f:
             lines.append(re.sub(r'(?<=\s)340282346638528859811704183484516925440\.000', '', line))
     util.save_list(s_file, lines)
Example #4
 def _strip_array_line(s_file):
     """Remove the second line (the 'AID' row) if present; return it, else ''."""
     with open(s_file) as f:
         lines = f.readlines()
     if len(lines) < 2:
         return ''
     s = lines[1]
     if s.startswith('AID'):
         del lines[1]
         util.save_list(s_file, lines)
         return s
     return ''
Example #5
    def make_input(self, s_file='untitled', options=None):
        if self.table is None: util.error_msg('Clustering.make_input: missing Clustering.table!')
        S=self.table.header()
        S_up=[ s.upper() for s in S]
        opt=self.input_opt
        opt.update(options or {})
        self.input_opt=opt
        S_miss=[s for s in opt['DATA_COLS'] if s not in S]
        if len(S_miss)>0: util.error_msg('Clustering.make_input: missing data column: '+", ".join(S_miss))
        i_id=util.index(opt['ID'], S)
        if (i_id<0):
            i_id=util.index('GENE', S_up)
            if i_id<0: util.error_msg('Clustering.make_input: no column is specified as the ID!')
            opt['ID']=S[i_id]
        if type(opt['DESCRIPTION']) is str: opt['DESCRIPTION']=[opt['DESCRIPTION']]
        I_des=[util.index(s, S) for s in opt['DESCRIPTION'] if util.index(s, S)>=0]

        if (len(I_des)==0):
            I_des=[i_id]
            opt['DESCRIPTION']=[opt['ID']]
        else:
            for i in I_des:
                self.table.iloc[:, i]=util.sarray2sarray(self.table.iloc[:,i])
        i_w=util.index(opt['WEIGHT_COL'], S)
        opt['DATA_COLS']=self.get_default_exp_cols(opt['DATA_COLS'])
        n_exp=len(opt['DATA_COLS'])
        if n_exp==0: util.error_msg('Clustering.make_input: no data column is specified!')

        S_out=[]
        S_out.append('Gene\tDescription\tWeight\t'+'\t'.join(opt['DATA_COLS']))
        if opt['EXP_WEIGHT'] is None or len(opt['EXP_WEIGHT'])!=n_exp:
            S_out.append('Exp\t\t'+'\t1'*n_exp)
        else:
            S_out.append('Exp\t\t\t'+'\t'.join(util.rarray2sarray(opt['EXP_WEIGHT'], s_format='%g', s_null=1.0)))
        #df.fillna('', inplace=True)
        i_cols=[S.index(s) for s in opt['DATA_COLS']]
        if opt['GENE_WEIGHT'] is not None and len(opt['GENE_WEIGHT'])==len(self.table):
            if opt['WEIGHT_COL']=='':
                opt['WEIGHT_COL']='WEIGHT'
            self.table[opt['WEIGHT_COL']]=opt['GENE_WEIGHT']
            # refresh the weight-column index, otherwise the weights just assigned are ignored below
            i_w=util.index(opt['WEIGHT_COL'], self.table.header())
        for i in range(len(self.table)):
            s=str(self.table.iloc[i, i_id])+'\t'+":".join(self.table.iloc[i, I_des])+'\t'+str(self.table.iloc[i, i_w] if i_w>=0 else 1)
            R=np.array([x for x in self.table.iloc[i,i_cols]])
            if opt['GENE_NORMALIZE'] and opt['NORMALIZE_METHOD']=='Z':
                valid=util.no_nan(R)
                if len(valid)>1:
                    R=(R-np.mean(valid))/np.std(valid, ddof=1)  # std over valid values; NaNs would poison np.std(R)
            s+='\t'+'\t'.join(['' if pd.isnull(x) else str(x) for x in R])
            S_out.append(s)
        if re.search(r'\.input$', s_file) is not None:
            s_file=re.sub(r'\.input$', '', s_file)
        util.save_list(s_file+".input", S_out, s_end='\n')
        self.input=s_file
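A hedged usage sketch (the column names are invented; the option keys are the ones the method reads above):

c = Clustering()  # assumes a Clustering instance whose .table has already been set
c.make_input('mydata', options={
    'ID': 'Gene',
    'DESCRIPTION': ['Symbol'],
    'DATA_COLS': ['Exp1', 'Exp2', 'Exp3'],
})
# writes the tab-delimited file mydata.input and records self.input='mydata'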
Example #6
def add_column(s_file, R, s_name, l_separator=True):
    """Add an extra column using value R array to an existing heat map.
    s_file: str, file name without extension, it will modify .cdt and .atr
    R: array(int/float), values to add
    s_name: str, column name
    l_separator: bool, default True. If True, add a column of blank value to separate the new column from existing ones."""
    if re.search(r'\.\w{3}$', s_file):
        s_file = s_file[:-4]
    if not os.path.exists(s_file + '.cdt'):
        util.error_msg("File does not exist: " + s_file + ".cdt!")
    S = []
    cnt = 0
    with open(s_file + '.cdt') as f:
        for line in f:
            SS = line.strip().split("\t")
            if SS[0].startswith('GENE'):
                if l_separator:
                    SS.append('')
                SS.append('%.2f' % R[cnt])
                cnt += 1
            elif SS[0] == 'GID':
                if l_separator:
                    SS.append('separator')
                SS.append(s_name)
            elif SS[0] == 'AID':
                # ARRY ids look like 'ARRY12X'; keep only the digits to find the next free id
                X = [int(re.sub(r'\D', '', x)) for x in SS if x.startswith('ARRY')]
                n_array = max(X) + 1
                SS.append('ARRY%dX' % n_array)
                if l_separator:
                    SS.append('ARRY%dX' % (n_array + 1))
            elif SS[0] == 'EWEIGHT':
                if l_separator:
                    SS.append('0')
                SS.append('0')
            S.append(SS)
    S = ["\t".join(X) for X in S]
    util.save_list(s_file + '.cdt', S, s_end="\n")

    if os.path.exists(s_file + '.atr'):
        S = util.read_list(s_file + '.atr')
        SS = S[-1].split("\t")
        n_node = int(re.sub(r'\D', '', SS[0])) + 1
        S.append('NODE%dX\tNODE%dX\tARRY%dX\t0' %
                 (n_node, n_node - 1, n_array))
        if l_separator:
            S.append('NODE%dX\tNODE%dX\tARRY%dX\t0' %
                     (n_node + 1, n_node, n_array + 1))
        util.save_list(s_file + '.atr', S, s_end="\n")
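A hedged usage sketch (file stem and values are invented; R must supply one value per GENE row of the .cdt):

import numpy as np
# append a 'Score' column plus a blank separator column to mydata.cdt,
# and extend mydata.atr so the array tree stays consistent
add_column('mydata', np.random.rand(500), 'Score', l_separator=True)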
Example #7
 def make_JTV(s_file, r_max=2.0):
     """Write a Java TreeView settings (.jtv) file; r_max sets the color-saturation contrast."""
     s='''<DocumentConfig>
 <UrlExtractor/>
 <ArrayUrlExtractor/>
 <Views>
     <View type="Dendrogram" dock="1">
         <ColorExtractor contrast="%s">
             <ColorSet up="#D8181C" zero="#D8D8D8" down="#3A6C9A" missing="#D8D8D8"/>
         </ColorExtractor>
         <ArrayDrawer/>
         <GlobalXMap current="Fill">
             <FixedMap type="Fixed" scale="7.0"/>
             <FillMap type="Fill"/><NullMap type="Null"/>
         </GlobalXMap>
         <GlobalYMap current="Fill">
             <FixedMap type="Fixed" scale="11.0"/>
             <FillMap type="Fill"/>
             <NullMap type="Null"/>
         </GlobalYMap>
         <ZoomXMap current="Fill">
             <FixedMap type="Fixed"/>
             <FillMap type="Fill"/>
             <NullMap type="Null"/>
         </ZoomXMap>
         <ZoomYMap current="Fill">
             <FixedMap type="Fixed"/>
             <FillMap type="Fill"/>
             <NullMap type="Null"/>
         </ZoomYMap>
         <TextView>
             <TextView>
                 <GeneSummary/>
             </TextView>
             <TextView>
                 <GeneSummary/>
             </TextView>
             <TextView>
                 <GeneSummary/>
             </TextView>
             <TextView>
                 <GeneSummary/>
             </TextView>
         </TextView>
         <ArrayNameView>
             <ArraySummary included="0"/>
         </ArrayNameView>
         <AtrSummary/>
         <GtrSummary/>
     </View>
 </Views></DocumentConfig>''' % r_max
     util.save_list(s_file+".jtv", s)
Example #8
def process_vehicle_image():
    imw = 64
    imh = 64
    data_path = '/home/kien/PycharmProjects/data/vietai-assignment-data/vehicles/'
    car_paths = sorted(glob.glob(data_path + 'car*.jpg'))
    motorbike_paths = sorted(glob.glob(data_path + 'motorbike*.jpg'))
    # assumes exactly 1500 images per class; images 1200-1499 become the test split below
    train_x_car = np.zeros((imh, imw, 1500))
    train_y_car = np.zeros((1500, 1))  # cars are labeled 0

    # NOTE: scipy.misc.imread/imresize were deprecated and removed in newer SciPy;
    # imageio.imread plus PIL/skimage resizing are the usual replacements.
    for i, car_path in enumerate(car_paths):
        img = scipy.misc.imread(car_path)
        img = scipy.misc.imresize(img, (imh, imw), interp='bicubic')
        img = np.mean(img, axis=2)  # collapse RGB to grayscale via the channel mean
        train_x_car[:,:,i] = img

    test_x_car = train_x_car[:,:,1200:]
    test_y_car = train_y_car[1200:]
    train_x_car = train_x_car[:,:,:1200]
    train_y_car = train_y_car[:1200]

    train_x_mo = np.zeros((imh, imw, 1500))
    train_y_mo = np.ones((1500, 1))  # motorbikes are labeled 1

    for i, motorbike_path in enumerate(motorbike_paths):
        img = scipy.misc.imread(motorbike_path)
        img = scipy.misc.imresize(img, (imh, imw), interp='bicubic')
        img = np.mean(img, axis=2)
        train_x_mo[:,:,i] = img

    test_x_mo = train_x_mo[:,:,1200:]
    test_y_mo = train_y_mo[1200:]
    train_x_mo = train_x_mo[:,:,:1200]
    train_y_mo = train_y_mo[:1200]

    train_x = np.concatenate((train_x_car, train_x_mo), axis=2)
    train_y = np.concatenate((train_y_car, train_y_mo), axis=0)
    test_x = np.concatenate((test_x_car, test_x_mo), axis=2)
    test_y = np.concatenate((test_y_car, test_y_mo), axis=0)
    
    #plt.ion()
    #for i in range(0,600,24):
    #    plt.clf()
    #    plt.imshow(test_x[:,:,i], cmap='gray')
    #    plt.show()
    #    plt.pause(0.2)
    #    print("%d %d" % (i, test_y[i, 0]))

    #pdb.set_trace()
    save_list([train_x, train_y, test_x, test_y], './data/vehicles.dat')
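Here save_list takes the object first and the path second, the reverse of util.save_list in Example #1, and it is clearly serializing whole arrays rather than text lines. A hedged sketch of that pickle-style variant:

import pickle

def save_list(obj, file_path):
    # Hypothetical: pickle obj (here a list of numpy arrays) to disk.
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f)

def load_list(file_path):
    # Hypothetical counterpart, matching the util.load_list(path) calls in Examples #11 and #13.
    with open(file_path, 'rb') as f:
        return pickle.load(f)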
Example #9
 def save(self, s_cdtfile, l_norm_row=False, r_max=2.0):
     """r_max controls the max matrix value to be color saturated"""
     X=self.data
     s_cdtfile=os.path.splitext(s_cdtfile)[0]  # strip the extension; .cdt/.gtr/.atr are appended below
     S_row=self.S_row
     S_description=self.S_description
     # defaults in case row clustering (self.Zr) was not computed
     S_gene=["GENE%dX" % (i+1) for i in range(X.shape[0])]
     if self.Zr is not None:
         den_r=FastCluster.linkage2order(self.Zr)
         X=X[den_r, :]
         S=[]  # rows: NODEID\tLEFT\tRIGHT\tCORRELATION
         r_dist=max(self.Zr[:, 2].max(), 1.0)
         n=X.shape[0]
         node_cnt=0
         S_gene=["GENE%dX" % (x+1) for x in den_r]
         S_row=[self.S_row[i] for i in den_r]
         S_description=[self.S_description[i] for i in den_r]
         for i,R in enumerate(self.Zr):
             node_cnt+=1
             s_left="GENE%dX" % int(R[0]+1) if int(R[0])<n else "NODE%dX" % (int(R[0]-n+1))
             s_right="GENE%dX" % int(R[1]+1) if int(R[1])<n else "NODE%dX" % (int(R[1]-n+1))
             S.append("NODE%dX\t%s\t%s\t%.4f" % (node_cnt, s_left, s_right, max(1.0-R[2]/r_dist, 0.0)))
         # write once, after the loop; the original rewrote the file on every iteration
         util.save_list(s_cdtfile+'.gtr', S, s_end="\n")
     S_col=self.S_col
     if self.Zc is not None:
         den_c=FastCluster.linkage2order(self.Zc)
         X=X[:, den_c]
         S=[]
         r_dist=max(self.Zc[:, 2].max(), 1.0)
         n=X.shape[1]
         node_cnt=0
         S_array=["ARRY%dX" % (x+1) for x in den_c]
         S_col=[self.S_col[i] for i in den_c]
         for i,R in enumerate(self.Zc):
             node_cnt+=1
             s_left="ARRY%dX" % int(R[0]+1) if int(R[0])<n else "NODE%dX" % (int(R[0]-n+1))
             s_right="ARRY%dX" % int(R[1]+1) if int(R[1])<n else "NODE%dX" % (int(R[1]-n+1))
             S.append("NODE%dX\t%s\t%s\t%.4f" % (node_cnt, s_left, s_right, max(1.0-R[2]/r_dist, 0.0)))
         # write once, after the loop
         util.save_list(s_cdtfile+'.atr', S, s_end="\n")
     n_exp=len(S_col)
     S=["GID\tGENE\tNAME\tGWEIGHT\t"+"\t".join(S_col)]
     if self.Zc is not None:
         S.append("AID\t\t\t\t"+"\t".join(S_array))
     S.append("EWEIGHT\t\t\t"+"\t1"*n_exp)
     for i,R in enumerate(X):
         if l_norm_row:
             R=(R-R.mean())/R.std()
         S.append(S_gene[i]+"\t"+S_row[i]+"\t"+S_description[i]+"\t1\t"+"\t".join(util.rarray2sarray(R, s_format="%.3f")))
     util.save_list(s_cdtfile+".cdt", S, s_end="\n")
     import cluster
     Clustering.make_JTV(s_cdtfile, r_max=r_max)
Example #10
        cost = tf.reduce_sum(tf.multiply(log_dist, masks))

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping is applied to mitigate the issue of exploding gradients
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients, global_step=global_step, name='train_op')


# split original data into training and test data
source_train, source_test, target_train, target_test = train_test_split(source_int_text,
                                                                        target_int_text, test_size=0.01, random_state=42)
# save dataset
util.save_list('source_train', source_train)
util.save_list('target_train', target_train)
util.save_list('source_test', source_test)
util.save_list('target_test', target_test)


print("The size for test data {},{}".format(len(source_test), len(target_test)))

# Split data to training and validation sets
valid_size = 1280
train_source = source_train[valid_size:]
train_target = target_train[valid_size:]
valid_source = source_train[:valid_size]
valid_target = target_train[:valid_size]

print("The size for training data {},{}".format(len(train_source), len(train_target)))
Example #11
def test():
    # Getting settings from config.py
    max_len = cfg.MAX_TOKEN_LEN
    num_token = cfg.NUM_OF_TOKEN
    imw = cfg.IMW
    imh = cfg.IMH

    # Training params
    is_train = False
    batch_size = 1

    # Tracking/Saving
    num_ite_to_log = cfg.NUM_ITE_TO_LOG
    num_ite_to_vis = cfg.NUM_ITE_TO_VIS
    save_name = cfg.SAVE_NAME
    test_name = cfg.TEST_NAME
    vis_path = cfg.VIS_PATH

    use_cuda = cfg.CUDA and torch.cuda.is_available()
    save_path = cfg.MODEL_FOLDER
    dataset_path = cfg.DATASET_PATH + 'CROHME2013_data/TestINKML/'
    scale_factor = cfg.TEST_SCALE_FACTOR

    # Load the vocab dictionary for display purpose
    word_to_id, id_to_word = get_gt.build_vocab('mathsymbolclass.txt')
    start_id = word_to_id['<s>']
    stop_id = word_to_id['</s>']

    # Initialize the network and load its weights
    net = AGRU()
    save_files = glob.glob(save_path + save_name + '*.dat')
    if (len(save_files) > 0):
        save_file = sorted(save_files)[-1]
        print('Loading network weights saved at %s...' % save_file)
        loadobj = torch.load(save_file)
        net.load_state_dict(loadobj['state_dict'])
        print('Loading done.')

    if (use_cuda):
        net.cuda()

    # For debugging
    if (not is_train):
        net.train(False)

    # Get full paths to test inkml files, create a list of scale factors to be used for rendering test images
    inkml_list = glob.glob(dataset_path + '*.inkml')
    scale_list = [scale_factor] * len(inkml_list)
    inkml_list = np.asarray(inkml_list)
    scale_list = np.asarray(scale_list)

    #inkml_list = inkml_list[0:120]
    #scale_list = scale_list[0:120]
    num_test = len(inkml_list)
    num_ite = int(np.ceil(1.0 * num_test / batch_size))

    # Exact match and word error rate
    em = []
    wer = []
    all_pred = []
    all_gt = []
    # Main test loop
    for i in range(num_ite):
        batch_idx = range(i * batch_size, (i + 1) * batch_size)
        if (batch_idx[-1] >= num_test):
            batch_idx = range(i * batch_size, num_test)
        batch_size = len(batch_idx)
        batch_x = util.batch_data(inkml_list[batch_idx], scale_list[batch_idx],
                                  is_train)
        batch_y_np = util.batch_target(inkml_list[batch_idx])
        batch_y = util.np_to_var(batch_y_np, use_cuda)

        #pred_y, attention = net(batch_x, batch_y)
        pred_y, attention = net.beam_search(batch_x, start_id, stop_id)
        pred_y = util.var_to_np(pred_y, use_cuda)
        pred_y = np.argmax(pred_y, 2)
        batch_y = np.reshape(batch_y_np, (batch_size, max_len))

        print('Finished ite %d/%d.' % (i, num_ite))
        j = 0

        pred_string = pred_y[j, :]
        pred_string = [id_to_word[idx] for idx in list(pred_string)]
        gt_string = batch_y[0, :]
        gt_string = [id_to_word[idx] for idx in list(gt_string)]
        all_pred.append(pred_string)
        all_gt.append(gt_string)
        em.append(util.exact_match(pred_string, gt_string))
        if ('</s>' in pred_string):
            pred_string = pred_string[0:pred_string.index('</s>') + 1]
        gt_string = gt_string[0:gt_string.index('</s>') + 1]
        wer.append(util.levenshtein_distance(pred_string, gt_string))

        if (i % 4 == 0):
            continue  # skip console output and attention dumps on every 4th iteration

        # Print prediction and target to the console
        print('Prediction: %s' % ' '.join(pred_string))
        print('Target: %s\n' % ' '.join(gt_string))

        # Save attention to files for visualization
        file_name = ntpath.basename(inkml_list[batch_idx[j]])[:-6]
        vis_path_j = vis_path + file_name + '/'
        if (not os.path.exists(vis_path_j)):
            os.makedirs(vis_path_j)

        tmp_x = np.sum(batch_x.data.cpu().numpy()[j, :, :, :], axis=0)
        attention_np = attention.data.cpu().numpy()[j, 1:, :, :]
        pred_string = pred_string[1:]
        for k, word in enumerate(pred_string):
            word = word.replace('/', 'slash_')
            attention_k = attention_np[k, :, :] / np.max(
                attention_np[k, :, :]) * 0.8
            attention_k = (scipy.misc.imresize(attention_k, 16.0)) / 255.0
            tmp_x = scipy.misc.imresize(tmp_x, attention_k.shape)
            attention_k += tmp_x
            attention_k[attention_k > 1] = 1
            try:
                scipy.misc.imsave(vis_path_j + ('%02d_%s.jpg' % (k, word)),
                                  attention_k)
            except FileNotFoundError:
                pdb.set_trace()
            if (word == '<slash_s>'):
                break

        #pdb.set_trace()

    print("Exact match count: %d/%d" % (sum(em), len(em)))
    print("Word error rate: %.5f" % (np.mean(wer)))
    #pdb.set_trace()
    util.save_list([em, wer, all_pred, all_gt], save_path + test_name + '.dat')

    #pdb.set_trace()
Example #12
    if args.frequency_results is not None:
        config['frequency_results'] = args.frequency_results

    if args.profile:
        # When profiling, just run the configuration
        import cProfile
        cProfile.run("all_runs(config)", sort=2)
        sys.exit()

    try:
        # Perform the actual run of the experiment
        raw_results, frequencies = all_runs(config)
        combined = sorted(combine_results(raw_results).items())
        print(combined)
        if args.output_results is not None:
            # Output the results, with the combined summary on the first line
            util.save_list(args.output_results, [combined] + raw_results)
        if args.output_config is not None:
            # Serialize the function list
            config['function_list'] = [func.__name__ for func in
                                       config['function_list']]
            # Save the final configuration as a single file
            util.save_configuration(args.output_config, config)
        if args.frequency_results is not None:
            # Save the frequency information
            processed = frequencies_to_vector(config, frequencies)
            util.save_configuration(args.frequency_results, processed)
    except KeyError as e:
        print('You must include a configuration value for', e.args[0])
Example #13
def train():
    # Getting settings from config.py
    max_len = cfg.MAX_TOKEN_LEN
    num_token = cfg.NUM_OF_TOKEN
    imw = cfg.IMW
    imh = cfg.IMH

    # Training params
    is_train = True
    batch_size_const = cfg.GPU_BATCH_SIZE
    num_ite_to_update = cfg.NUM_ITE_TO_UPDATE
    lr = cfg.LR
    momentum = cfg.MOMENTUM
    lr_decay = cfg.LR_DECAY
    max_grad = cfg.MAX_GRAD_CLIP
    num_e = cfg.NUM_EPOCH

    # Tracking/Saving
    last_e = -1
    global_step = 0
    running_loss = 0
    num_ite_to_log = cfg.NUM_ITE_TO_LOG
    num_ite_to_vis = cfg.NUM_ITE_TO_VIS
    num_epoch_to_save = cfg.NUM_EPOCH_TO_SAVE
    all_loss = []
    save_name = cfg.SAVE_NAME
    meta_name = cfg.META_NAME
    vis_path = cfg.VIS_PATH

    use_cuda = cfg.CUDA and torch.cuda.is_available()
    save_path = cfg.MODEL_FOLDER
    dataset_path = cfg.DATASET_PATH + 'CROHME2013_data/TrainINKML/'
    subset_list = cfg.SUBSET_LIST
    scale_factors = cfg.SCALE_FACTORS

    # Load the vocab dictionary for display purpose
    _, id_to_word = get_gt.build_vocab('mathsymbolclass.txt')

    # Initialize the network and load its weights
    net = AGRU()
    save_files = glob.glob(save_path + save_name + '*.dat')
    meta_files = glob.glob(save_path + meta_name + '*.dat')
    if (len(save_files) > 0):
        save_file = sorted(save_files)[-1]
        print('Loading network weights saved at %s...' % save_file)
        loadobj = torch.load(save_file)
        net.load_state_dict(loadobj['state_dict'])
        last_e, running_loss, all_loss, lr = util.load_list(
            sorted(meta_files)[-1])
        print('Loading done.')

    if (use_cuda):
        net.cuda()

    # For debugging
    if (not is_train):
        net.train(False)

    # Get a list of convolutional layers
    conv_layers = util.get_layers(net, lambda x: type(x) == type(net.conv1_3))

    # Get conv parameters
    conv_params = []
    for c in conv_layers:
        for p in c.parameters():
            if (p.requires_grad):
                conv_params.append(p)

    # Get a list of trainable layers that are not convolutional
    other_layers = util.get_layers(
        net,
        lambda x: type(x) != type(net.conv1_3) and hasattr(x, 'parameters'))
    other_layers = other_layers[1:]  # The first layer is attend_GRU.AGRU

    # Get GRU parameters
    gru_params = []
    for l in other_layers:
        for p in l.parameters():
            gru_params.append(p)

    # Two parameter groups so the conv and GRU learning rates can be tuned
    # independently (both start at lr; the decay step below updates both)
    optimizer = optim.Adam([
        {'params': gru_params},
        {'params': conv_params, 'lr': lr},
    ], lr=lr)

    # Loss function
    criterion = nn.CrossEntropyLoss(ignore_index=1)

    # Get full paths to train inkml files, create a list of scale factors to be used for rendering train images
    inkml_list = []
    scale_list = []

    for i, subset in enumerate(subset_list):
        subset_inkml_list = glob.glob(dataset_path + subset + '*.inkml')
        inkml_list += subset_inkml_list
        scale_list += [scale_factors[i]] * len(subset_inkml_list)
    inkml_list = np.asarray(inkml_list)
    scale_list = np.asarray(scale_list)

    #inkml_list = inkml_list[0:120]
    #scale_list = scale_list[0:120]
    num_train = len(inkml_list)
    num_ite = int(np.ceil(1.0 * num_train / batch_size_const))

    # Main train loop
    optimizer.zero_grad()
    for e in range(last_e + 1, num_e):
        permu_ind = np.random.permutation(range(num_train))
        inkml_list = inkml_list[permu_ind.astype(int)]
        scale_list = scale_list[permu_ind.astype(int)]

        if (e % cfg.NUM_EPOCH_TO_DECAY == cfg.NUM_EPOCH_TO_DECAY - 1):
            lr = lr * lr_decay
            print('Current learning rate: %.8f' % lr)
            optimizer.param_groups[0]['lr'] = lr
            optimizer.param_groups[1]['lr'] = lr

        for i in range(num_ite):

            batch_idx = range(i * batch_size_const, (i + 1) * batch_size_const)
            if (batch_idx[-1] >= num_train):
                batch_idx = range(i * batch_size_const, num_train)
            batch_size = len(batch_idx)
            batch_x = util.batch_data(inkml_list[batch_idx],
                                      scale_list[batch_idx], is_train)
            batch_y_np = util.batch_target(inkml_list[batch_idx])
            batch_y = util.np_to_var(batch_y_np, use_cuda)

            pred_y, attention = net(batch_x, batch_y)

            # Convert the 3D tensor to a 2D matrix of shape (batch_size*MAX_TOKEN_LEN, NUM_OF_TOKEN) to compute log loss
            pred_y = pred_y.view(-1, num_token)
            # Remove the <start> token from the target vector & prediction vector
            batch_y = batch_y.view(batch_size, max_len)
            batch_y = batch_y[:, 1:].contiguous()
            batch_y = batch_y.view(-1)
            pred_y = pred_y.view(batch_size, max_len, num_token)
            pred_y = pred_y[:, 1:].contiguous()
            pred_y = pred_y.view(batch_size * (max_len - 1), num_token)

            loss = criterion(pred_y, batch_y)
            loss.backward()
            running_loss += loss.data[0]  # pre-0.4 PyTorch idiom; newer versions use loss.item()

            if (global_step % num_ite_to_update == (num_ite_to_update - 1)):
                util.grad_clip(net, max_grad)
                optimizer.step()
                optimizer.zero_grad()
                running_loss /= num_ite_to_update
                all_loss.append(running_loss)
                running_loss = 0

            # Periodically log loss, prediction, and target to the console
            if (global_step % num_ite_to_log == (num_ite_to_log - 1)):
                print('Finished ite %d/%d, epoch %d/%d, loss: %.5f' %
                      (i, num_ite, e, num_e, all_loss[-1]))

                # Printing prediction and target
                pred_y_np = util.var_to_np(pred_y, use_cuda)
                pred_y_np = np.reshape(pred_y_np,
                                       (batch_size, max_len - 1, num_token))
                # Only display the first sample in the batch
                pred_y_np = pred_y_np[0, 0:40, :]
                pred_y_np = np.argmax(pred_y_np, axis=1)
                pred_list = [id_to_word[idx] for idx in list(pred_y_np)]
                print('Prediction: %s' % ' '.join(pred_list))

                batch_y_np = np.reshape(batch_y_np, (batch_size, max_len))
                batch_y_np = batch_y_np[0, 1:40]
                target_list = [id_to_word[idx] for idx in list(batch_y_np)]
                print('Target: %s\n' % ' '.join(target_list))

            if (global_step % num_ite_to_vis == (num_ite_to_vis - 1)):
                tmp_x = util.var_to_np(batch_x, use_cuda)[0, :, :, :]
                tmp_x = np.transpose(tmp_x, (1, 2, 0))[:, :, 0:3]
                attention_np = attention.data.cpu().numpy()[0, 2:, :, :]
                for k in range(10):
                    attention_k = attention_np[k, :, :] / np.max(
                        attention_np[k, :, :]) * 0.8
                    attention_k = (scipy.misc.imresize(
                        attention_k, 16.0, interp='bicubic')) / 255.0
                    tmp_x = scipy.misc.imresize(tmp_x, attention_k.shape)
                    attention_k = np.repeat(np.expand_dims(attention_k, 2), 3,
                                            2)
                    attention_k = attention_k * 255
                    attention_k += tmp_x
                    attention_k /= 2.0
                    attention_k[attention_k > 255] = 255
                    attention_k = (attention_k).astype(np.uint8)
                    scipy.misc.imsave(vis_path + ('%02d.jpg' % k), attention_k)

                plt.clf()
                plt.plot(all_loss)
                plt.savefig(vis_path + 'loss.png')  # save before show(); show() may block or clear the figure
                plt.show()

            global_step += 1

        if (e % num_epoch_to_save == (num_epoch_to_save - 1)):
            print('Saving at epoch %d/%d' % (e, num_e))
            torch.save(
                {
                    'state_dict': net.state_dict(),
                    'opt': optimizer.state_dict()
                }, save_path + save_name + ('_%03d' % e) + '.dat')
            metadata = [e, running_loss, all_loss, lr]
            util.save_list(metadata,
                           save_path + meta_name + ('_%03d' % e) + '.dat')

        last_e = e
Example #14
def color_cdt(s_file,
              exps=None,
              exp_bgcolor=None,
              genes=None,
              gene_bgcolor=None):
    if not s_file.endswith('.cdt'):
        s_file += '.cdt'
    if not os.path.exists(s_file):
        util.error_msg("File does not exist: " + s_file + "!")
    BG = '#ffffff'
    S = []
    c_first = {}
    with open(s_file) as f:
        for i, line in enumerate(f):
            SS = line.strip().split("\t")
            c_first[SS[0]] = i
            S.append(SS)
    S_header = S[0]
    i_gene = util.index('GENE', S_header)
    i_name = util.index('NAME', S_header)
    i_gid = util.index('GID', S_header)
    i_w = util.index("GWEIGHT", S_header)
    offset = max([i_gene, i_name, i_gid, i_w]) + 1
    n_exp = len(S_header) - offset
    if 'EWEIGHT' not in c_first:
        # add EWEIGHT ROW
        i_w = max([c_first.get('GID', -1), c_first.get('AID', -1)]) + 1
        S.insert(i_w, ['EWEIGHT'] + [''] * (offset - 1) + ['1.000'] * n_exp)
        c_first['EWEIGHT'] = i_w
    i_w = util.index("GWEIGHT", S_header)
    if i_w < 0:  # add GWEIGHT column
        i_w = offset
        S_header.insert(i_w, 'GWEIGHT')
        for i in range(1, len(S)):
            if i <= c_first['EWEIGHT']:
                S[i].insert(i_w, '')
            else:
                S[i].insert(i_w, '1.000')
        offset += 1
    i_gene_color = util.index('BGCOLOR', S_header)
    if i_gene_color < 0 and genes is not None:
        i_gene_color = offset - 1
        S_header.insert(i_gene_color, 'BGCOLOR')
        offset += 1
        for i in range(1, len(S)):
            if i <= c_first['EWEIGHT']:
                S[i].insert(i_gene_color, '')
            else:
                S[i].insert(i_gene_color, BG)
    i_exp_color = c_first.get('BGCOLOR', -1)
    if i_exp_color < 0 and exps is not None:
        i_exp_color = c_first['EWEIGHT']
        S.insert(i_exp_color, ['BGCOLOR'] + [''] * (offset - 1) + [BG] * n_exp)
        c_first['EWEIGHT'] += 1
    if genes is not None:
        c_m = Tree.color_map(genes, gene_bgcolor)
        idx = i_gene if i_gene >= 0 else i_name
        for i in range(c_first['EWEIGHT'] + 1, len(S)):
            S[i][i_gene_color] = c_m.get(S[i][idx], BG)
    if exps is not None:
        c_m = Tree.color_map(exps, exp_bgcolor)
        SS = S[i_exp_color]  # the BGCOLOR row itself, whether pre-existing or just inserted
        for i in range(offset, len(SS)):
            SS[i] = c_m.get(S_header[i], BG)
    S = ["\t".join(X) for X in S]
    util.save_list(s_file, S, s_end="\n")
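A hedged usage sketch (file name, labels, and colors are invented, and the exact shapes Tree.color_map expects are a guess from the code above):

color_cdt('mydata',
          exps=['Exp1', 'Exp2'], exp_bgcolor=['#fee0d2', '#de2d26'],
          genes=['TP53', 'MYC'], gene_bgcolor=['#deebf7', '#3182bd'])
# rewrites mydata.cdt in place, filling the BGCOLOR row (experiments) and column (genes)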
Example #15
 def _insert_array_line(s_file, s):
     """Re-insert s as the second line of s_file (the counterpart of _strip_array_line)."""
     with open(s_file) as f:
         lines = f.readlines()
     lines.insert(1, s)  # equivalent to the original, more obscure lines[1:0]=[s]
     util.save_list(s_file, lines)
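A hedged round-trip sketch combining this with _strip_array_line from Example #4 (the file name is invented):

s_aid = _strip_array_line('mydata.cdt')      # temporarily remove the AID row
# ... process the file with a tool that cannot handle the AID row ...
if s_aid:
    _insert_array_line('mydata.cdt', s_aid)  # put it back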
Example #16
    def plot(self, karyotype="", symbol=None, links=None, hits=None, outputdir=None, outputfile="CircosPlot"):
        #sw=util.StopWatch()
        outputdir=outputdir if outputdir is not None else '/tmp'
        for ext in [".png" , ".svg"]:
            s_file=os.path.join(outputdir, outputfile+ext)
            if os.path.exists(s_file):
                os.remove(s_file)
        if links is None:
            util.warn_msg('No links to plot; ignoring')
            #return
        tmp=tempfile.NamedTemporaryFile(dir=outputdir, delete=False, prefix="CIRCOS_", suffix=".txt")
        conf_file=tmp.name
        S_tmp_file=[conf_file]

        s_conf=util.read_string(self.TEMPLATE)
        kary_file=re.sub('CIRCOS_', 'CIRCOS_KARYOTYPE_', conf_file)
        util.save_string(kary_file, karyotype)
        S_tmp_file.append(kary_file)
        s_conf=re.sub(r'@KARYOTYPE@', kary_file, s_conf)

        r0=0.90
        s_plot=""
        if hits is None:
            hits=[]
        elif type(hits) is not list:
            hits=[hits]
        for i,s_hit in enumerate(hits):
            hit_file=re.sub('CIRCOS_', 'CIRCOS_HIT_%d_' % i, conf_file)
            util.save_list(hit_file, s_hit)
            S_tmp_file.append(hit_file)
            s_plot+="<plot>\n"
            s_plot+="file = "+hit_file+"\n"
            s_plot+="r0 = "+('%.3f' % r0)+"r\n"
            s_plot+="r1 = "+('%.3f' % r0)+"r+70p\n"
            s_plot+="stroke_thickness = 0\n"
            s_plot+="min = 0\n"
            s_plot+="max = 2\n"
            s_plot+="color = oranges-3-seq\n"
            s_plot+="</plot>\n\n"
            r0-=0.05

        s_conf=re.sub(r'@PLOTS@', s_plot, s_conf)
        #t_chr=pd.read_csv(os.path.join(Circos.HOME, "karyotype_"+pid+".tmp"), sep=r'\s+', header=None)
        #s_conf=re.sub(r'@CHROMOSOMES@', ";".join(t_chr[2]), s_conf)
        #avoid using Pandas, so that this script can be used in CGI on ldweb server, where numpy is not installed correctly
        S=karyotype.split("\n")
        S_chr=[]
        for s in S:
            if s.strip()=='': break
            S_chr.append(re.split(Circos.DELIMITER, s)[2])
        s_conf=re.sub(r'@CHROMOSOMES@', ";".join(S_chr), s_conf)
        s_symbol=""
        if symbol is not None:
            symbol_file=re.sub('CIRCOS_', 'CIRCOS_SYMBOL_', conf_file)
            util.save_string(symbol_file, symbol)
            S_tmp_file.append(symbol_file)
            s_symbol+="<plot>\n"
            s_symbol+="type = text\n"
            s_symbol+="color = black\n"
            s_symbol+="file = "+symbol_file+"\n"
            s_symbol+="r0=1.02r\n"
            s_symbol+="r1=1.2r\n"
            s_symbol+="label_size = 12p\n"
            s_symbol+="label_font = condensed\n"
            s_symbol+="padding = 0p\n"
            s_symbol+="rpadding = 0p\n"
            s_symbol+="</plot>\n"
        s_conf=re.sub(r'@SYMBOL@', s_symbol, s_conf)

        S_color=['107,174,214', '116,196,118', '106,81,163']
        s_link=""
        MAX_EDGES=10000 # Circos does not handle very many edges well; it stops drawing somewhere past roughly 20000
        if links is not None:
            if type(links) is str: links=[links]
            for i in range(len(links)-1, -1, -1):
                link_file=re.sub('CIRCOS_', 'CIRCOS_LINK%02d_' % (i+1), conf_file)
                S_tmp_file.append(link_file)
                S_edges=links[i].strip().split("\n")
                n_edge=len(S_edges)//2  # each edge occupies two consecutive lines
                if n_edge>MAX_EDGES:
                    # randomly sample a subset, keeping each two-line edge pair together
                    IDX=np.repeat(np.random.permutation(list(range(0,len(S_edges),2)))[:MAX_EDGES], 2)
                    IDX[list(range(1,len(IDX),2))]+=1
                    S_edges=pd.Series(S_edges)[IDX].astype(str)
                    links[i]="\n".join(S_edges)
                util.save_string(link_file, links[i])
                s_link+="<link link"+str(i+1)+">\n"
                s_link+="show = yes\n"
                s_link+="color = "+S_color[(i+len(S_color)-1)%len(S_color)]+"\n"
                s_link+="file = "+link_file+"\n"
                s_link+="</link>\n\n"
        s_conf=re.sub(r'@LINKS@', s_link, s_conf)
        #print s_conf
        util.save_string(conf_file, s_conf)
        #print s_conf
        ## run Circos
        #s_cmd = "cd "+os.path.join(os.path.dirname(__file__), "circos")+"; "
        s_cmd=self.BIN+" -conf "+conf_file
        s_cmd+=' -outputdir '+outputdir
        s_cmd+=' -outputfile '+outputfile
        #sw.check('prepare conf file')
        print(s_cmd)
        util.unix(s_cmd, l_print=False, l_error=False)
        l_remove_temp=True
        if l_remove_temp:
            for f in S_tmp_file:
                os.remove(f)
        s_file=os.path.join(outputdir, outputfile+".png")
        #sw.check('make circos image')
        if os.path.exists(s_file):
            return s_file
        else:
            return None