Example #1
def get_prediction(file_path):

    # predictions = np.loadtxt("prob_result.csv")
    predictions = predict.get_predictions(file_path)
    labels = predict.get_labels()
    print(predictions.shape)

    file_path = file_path + "?cachebuster=%s" % time.time()
    result = {
        "video": {
            "url": "%s" % file_path,
            "framerate": 25
        },
        "frames": []
    }

    for index, row in enumerate(predictions):
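        # Keep the five highest-probability labels for this frame;
        # np.argpartition returns the top-5 indices in arbitrary order.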

        pred_per_label = []

        five_best = np.argpartition(row, -5)[-5:]
        for i in five_best:
            pred_per_label.append({"label": labels[i], "prob": row[i]})

        new_frame = {"frameNumber": index, "predictions": pred_per_label}

        result["frames"].append(new_frame)

    return result
Example #2
def update_table(n_clicks):
    if n_clicks == 0:
        raise PreventUpdate
    else:
        predictions = predict.get_predictions(client=client,
                                              features_df=features)
        results_table = pd.merge(predictions, table, on='author')
        results_table = results_table[[
            'Default Likelihood_x', 'author', 'title', 'full_link',
            'created_utc'
        ]]
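        # Dash DataTable column definitions; 'full_link' uses markdown
        # presentation so the link is rendered as clickable.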
        columns = [{
            "name": 'Default Likelihood',
            "id": 'Default Likelihood_x'
        }, {
            "name": 'Borrower',
            "id": 'author'
        }, {
            "name": 'Post Title',
            "id": 'title'
        }, {
            "name": 'Link',
            "id": 'full_link',
            'presentation': 'markdown'
        }, {
            "name": 'Create Date UTC',
            "id": 'created_utc'
        }]
        return (results_table.to_dict('records'), columns)
Example #3
def eval_epoch(model, data_loader, device):
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
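        # Gradients are not needed for evaluation, which saves memory and compute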
        for data in data_loader:
            tokens_tensors, segments_tensors, masks_tensors, labels = [
                t.to(device) for t in data
            ]

            # Forward pass; outputs: (loss), (logits), (hidden_states), (attentions)
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors,
                            labels=labels)
            loss = outputs[0]
            val_loss += loss.item()

    # Compute accuracy and average loss
    _, val_acc = predict.get_predictions(model,
                                         data_loader,
                                         device,
                                         compute_acc=True)
    val_avg_loss = val_loss / len(data_loader)

    return val_acc, val_avg_loss
Example #4
def train_epoch(model, data_loader, device):
    model.train()
    #optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)
    train_loss = 0.0
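    # Note: the optimizer is re-created on every call, so its state does not persist across epochs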
    optimizer = AdamW(model.parameters(), lr=1e-5)

    for data in data_loader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [
            t.to(device) for t in data
        ]
        optimizer.zero_grad()

        # Forward pass; outputs: (loss), (logits), (hidden_states), (attentions)
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)
        loss = outputs[0]
        train_loss += loss.item()
        loss.backward()
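        # Clip gradients before the optimizer step to keep updates stable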
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    # Compute accuracy and average loss
    _, train_acc = predict.get_predictions(model,
                                           data_loader,
                                           device,
                                           compute_acc=True)
    train_avg_loss = train_loss / len(data_loader)

    return train_acc, train_avg_loss
Example #5
def get_prediction(file_path):

    # predictions = np.loadtxt("prob_result.csv")
    predictions = predict.get_predictions(file_path)
    labels = predict.get_labels()
    print(predictions.shape)

    file_path = file_path + "?cachebuster=%s" % time.time()
    result = {
        "video": {
            "url": "%s" % file_path,
            "framerate": 25
        },
        "frames": []
    }


    for index, row in enumerate(predictions):

        pred_per_label = []

        five_best = np.argpartition(row, -5)[-5:]
        for i in five_best:
            pred_per_label.append({"label": labels[i], "prob": row[i]})

        new_frame = {
            "frameNumber": index,
            "predictions": pred_per_label
        }

        result["frames"].append(new_frame)


    return result
Example #6
def main():

    model = load_cnn_imdb()
    labels = load_cnn_imdb_labels()
    word_index = load_imdb_word_index()
    features = load_imdb_keywords()

    m = [model, labels, word_index]

    for filename in os.listdir('reviews/'):
        filename = filename.split('.')[0]
        with open("reviews/" + filename + ".txt", "r", encoding="utf8") as news_file:
            text = news_file.read().replace('\n', '')
        text = re.sub(r'[^\w\s]', '', text)
        tokens = word_tokenize(text)
        filtered_tokens = [
            word.lower() for word in tokens
            if word not in stopwords.words('english')
        ]
        lmtzr = WordNetLemmatizer()
        lems = [lmtzr.lemmatize(t) for t in filtered_tokens]

        instance_features = []

        for l in lems:
            for word in features.keys():
                if (l == word and l not in instance_features):
                    instance_features.append(word)

        perturbated = {}

        for f in instance_features:
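            # Rebuild the review text with every occurrence of this keyword removed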
            p = []
            for t in tokens:
                if (t != f):
                    p.append(t)
            perturbated[f] = " ".join(p)

        original_text = " ".join(tokens)

        with open(os.path.join("results", "perturbated_" + filename + ".txt"), "w") as out_file:
            out_file.write(text + "\n\n")
            for p in perturbated:
                out_file.write(p + "\n" + perturbated[p] + "\n\n")

        get_predictions(original_text, perturbated, filename, m)
Example #7
def main():
    for i in range(10):
        model_path = f'../models/chukchi/chukchi_{i}'
        test_path = f'../data/Chukchi/flat_test/ckt_hse-ud-test.0{i}.conllu'
        savepath = f'../results/chukchi/test_result_{i}.conllu'

        config = Params.from_file(Path(model_path, 'config.json'))
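        # Rebuild the dataset reader described in the saved AllenNLP config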
        dataset_reader = DatasetReader.from_params(config['dataset_reader'])

        model = Model.load(config,
                           serialization_dir=model_path,
                           cuda_device=-1)
        model.eval()

        predictor = UniversalDependenciesCharacterLevelPredictor(
            model=model, dataset_reader=dataset_reader)

        predictions = get_predictions(test_path, dataset_reader, predictor)
        save_predictions_to_conllu(savepath, predictions)
Example #8
def score():
    jsons = json.dumps(request.json, sort_keys=True, indent=4, separators=(',', ':'))
    m = hashlib.sha224(jsons.encode('utf-8')).hexdigest()
    if col.find_one({'_id': m}) is None:
        print("New Point")
        timepoint = time.time()

        datapoint_dict = json.loads(jsons)
        df = pd.DataFrame.from_dict(datapoint_dict, orient='index').T
        df = predict.clean_json(df)
        predictions = predict.get_predictions(df, model)
        datapoint_dict['predicts'] = tuple(predictions[0])
        datapoint_dict['time'] = timepoint

        datapoint_json = json.dumps(datapoint_dict)

        col.insert_one({'_id': m, 'data': datapoint_json})
    else:
        print("Seen this point")
    return ""
Example #9
def score():
    jsons = json.dumps(request.json,
                       sort_keys=True,
                       indent=4,
                       separators=(',', ':'))
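    # Hash the canonicalized JSON payload so identical requests are scored only once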
    m = hashlib.sha224(jsons.encode('utf-8')).hexdigest()
    if col.find_one({'_id': m}) is None:
        print("New Point")
        timepoint = time.time()

        datapoint_dict = json.loads(jsons)
        df = pd.DataFrame.from_dict(datapoint_dict, orient='index').T
        df = predict.clean_json(df)
        predictions = predict.get_predictions(df, model)
        datapoint_dict['predicts'] = tuple(predictions[0])
        datapoint_dict['time'] = timepoint

        datapoint_json = json.dumps(datapoint_dict)

        col.insert_one({'_id': m, 'data': datapoint_json})
    else:
        print("Seen this point")
    return ""
Example #10
def run():

    print("Importing the datasets...\n")
    df = pd.read_csv(config.Training_file)

    # Multi-Genre NLI Corpus
    df_mnli = data_augmentation.load_mnli()

    # Cross-Lingual NLI Corpus
    df_xnli = data_augmentation.load_xnli()

    df.drop(columns=['id'], inplace=True)

    print("Imported...\n")

    print("Processing and doing Data Augmentation...\n")

    print("Back translation started...\n")

    df = data_augmentation.proc(df)

    print("Back Translation Done...\n")

    df = pd.concat([df, df_mnli, df_xnli], ignore_index=True)

    # Shuffle the dataframe
    df = resample(df, random_state=config.RANDOM_SEED)
    df['premise'] = df['premise'].astype(str)
    df['hypothesis'] = df['hypothesis'].astype(str)

    df.drop_duplicates(subset=['premise', 'hypothesis'], inplace=True)

    combined_thesis = df[['premise', 'hypothesis']].values.tolist()
    df['combined_thesis'] = combined_thesis

    # 70/30 train/test split, then split the held-out 30% into test (60%) and validation (40%)
    df_train, df_test = train_test_split(df, test_size=0.3, random_state=0)
    df_test, df_val = train_test_split(df_test, test_size=0.4, random_state=0)

    train_data_loader = dataset.create_data_loader(df_train, config.tokenizer,
                                                   config.max_len, config.batch_size)
    test_data_loader = dataset.create_data_loader(df_test, config.tokenizer,
                                                  config.max_len, config.batch_size)
    val_data_loader = dataset.create_data_loader(df_val, config.tokenizer,
                                                 config.max_len, config.batch_size)

    print("Processing and Data Augmentation Done...\n")

    # Build the RoBERTa classifier (3 labels) and move it to the target device
    device = torch.device(config.DEVICE)
    model = roberta_model(3)
    model = model.to(device)

    # Optimizer, scheduler and loss_fn used by engine.train_roberta_epoch and engine.eval_roberta
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(train_data_loader) * config.EPOCHS

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    loss_fn = nn.CrossEntropyLoss().to(device)
    
    history = defaultdict(list)
    best_accuracy = 0
    
    print("Training...\n")
    for epoch in range(config.EPOCHS):

        print('Epoch {}\n'.format(epoch + 1))
        print("-" * 100)
        
        train_acc, train_loss = engine.train_roberta_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(df_train)
        )

        print('train_acc {} train_loss {}'.format(train_acc, train_loss))
        
        val_acc, val_loss = engine.eval_roberta(
            model,
            val_data_loader,
            loss_fn,
            device,
            len(df_val)
        )

        print('val_acc {} val_loss {}'.format(val_acc, val_loss))

        print()
        
        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_loss'].append(val_loss)
        
        if val_acc > best_accuracy:
            
            # model.state_dict() holds only the model's parameters (the optimizer state is not saved)
            torch.save(model.state_dict(), 'best_model.bin')
            best_accuracy = val_acc
        
    print("Training completed...")

    print("Testing...\n")

    # Predictions on the held-out test set
    y_thesis, y_pred, y_pred_probs, y_test = predict.get_predictions(
        model,
        test_data_loader,
        device
    )

    # Classification report
    print(classification_report(y_test, y_pred))

    # Confusion matrix, flattened row by row
    cm = confusion_matrix(y_test, y_pred).ravel()
    print("Flattened confusion matrix:", cm)
    print()

    # Accuracy
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print()

    print("DONE!!")
Example #11
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1])
plt.grid()
plt.savefig('acc_history.png')
plt.clf()

plt.plot(history['train_loss'], label='train loss')
plt.plot(history['val_loss'], label='val loss')
plt.title('Loss history')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.grid()
plt.savefig('loss_history.png')

# Inference with test set
test_set = CovidDataset(df_test, tokenizer=tokenizer)
test_loader = DataLoader(test_set,
                         batch_size=256,
                         collate_fn=batch.create_mini_batch)
predictions = predict.get_predictions(
    model, test_loader, device
)  ### Known bug: why does get_predictions return a tuple with two identical prediction results?

# Append predictions to the dataframe and save to .csv
df_pred = df_test
df_pred['prediction'] = predictions[0].tolist()
df_pred.to_csv('predict.csv', index=False)