def analyse_subject(subject, paramspath, pipesdir, resultsdir):
    # Load analysis parameters and fetch this subject's data
    yml = load_yml(paramspath)
    data_params, analysis_params = parse_yml_params(yml)
    data_params['subject'] = subject
    X, y = get_data(data_params)

    # Build one pipeline per pipeline_* YAML file
    pipelines = OrderedDict()
    pipepaths = sorted(glob.glob(pipesdir + 'pipeline_*'))
    for i, ymlpath in enumerate(pipepaths):
        yml = load_yml(ymlpath)
        label = yml['label']
        pipelines[label] = parse_yml_pipeline(yml)

    print ''
    print 'Processing ' + str(len(pipepaths)),
    print 'pipelines for subject ' + str(subject)
    print ''

    scores = crossvalidation(X, y, pipelines, analysis_params)

    # Persist one score file per pipeline under resultsdir/subject<N>/
    resultsdir = resultsdir + 'subject' + str(subject) + '/'
    if not os.path.exists(resultsdir):
        os.makedirs(resultsdir)
    for score, rstpath in zip(scores, pipepaths):
        rstpath = rstpath.split('/')[-1]
        rstpath = string.join([rstpath.split('.')[0]] + ['pkl'], '.')
        rstpath = resultsdir + rstpath
        joblib.dump(score, rstpath)
def get_features(regenerate=True):
    if regenerate:
        agg, log, flg = get_data()
        features = flg.loc[:, 'USRID':'USRID']
        all_user_id = flg.loc[:, 'USRID':'USRID']
        feature_types = list()
        for feature in FEATURE_LIST:
            print(feature[0])
            feature_val, feature_type = feature[1](agg, log, all_user_id)
            features = pd.merge(features, feature_val, on=['USRID'], how='left')
            feature_types += feature_type
        features.to_csv('./feature/features.csv')
        with open('./feature/feature_types', 'wb') as f:
            pickle.dump(feature_types, f)
        flg.to_csv('./feature/flg.csv')
    else:
        features = pd.read_csv('./feature/features.csv', index_col=0)
        with open('./feature/feature_types', 'rb') as f:
            feature_types = pickle.load(f)
        flg = pd.read_csv('./feature/flg.csv', index_col=0)
    features = features.reset_index(drop=True)
    flg = flg.reset_index(drop=True)
    train_features = features[flg['FLAG'] != -1]
    test_features = features[flg['FLAG'] == -1]
    train_flg = flg[flg['FLAG'] != -1]
    test_flg = flg[flg['FLAG'] == -1]
    train = [train_features, train_flg]
    test = [test_features, test_flg]
    return train, test, feature_types
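# A hypothetical usage sketch for get_features(): regenerate=False reuses
# the cached files under ./feature/ written by an earlier regenerate=True run.
train, test, feature_types = get_features(regenerate=False)
train_features, train_flg = train
test_features, test_flg = test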
def gen_data():
    # texts and labels are module-level lists in the original script
    label_map = {'none': 0, 'racism': 1, 'sexism': 2}
    tweet_data = get_data()
    for tweet in tweet_data:
        texts.append(tweet['text'].lower())
        labels.append(label_map[tweet['label']])
    print('Found %s texts. (samples)' % len(texts))
def main_fast_text():
    # texts, labels and label_map are module-level in the original script
    # (cf. gen_data above)
    tweet_data = get_data()
    for tweet in tweet_data:
        texts.append(tweet['text'])
        labels.append(label_map[tweet['label']])
    print('Found %s texts. (samples)' % len(texts))

    EMBEDDING_DIM = 25
    GLOVE_MODEL_FILE = "glove.twitter.27B.25d.txt"

    tokenizer = "glove"
    if tokenizer == "glove":
        TOKENIZER = glove_tokenize
    elif tokenizer == "nltk":
        TOKENIZER = tokenize_nltk.casual.TweetTokenizer(
            strip_handles=True, reduce_len=True).tokenize

    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(GLOVE_MODEL_FILE)

    tweets = select_tweets(TOKENIZER, word2vec_model)
    gen_vocab(TOKENIZER, tweets)
    X, y = gen_sequence(TOKENIZER, tweets)

    MAX_SEQUENCE_LENGTH = max(map(lambda x: len(x), X))
    print("max seq length is %d" % (MAX_SEQUENCE_LENGTH))

    data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    y = np.array(y)
    W = get_embedding_weights(EMBEDDING_DIM, word2vec_model)
    data, y = sklearn.utils.shuffle(data, y)

    model = fast_text_model(data.shape[1], EMBEDDING_DIM)
    _ = train_fasttext(data, y, model, EMBEDDING_DIM, W)
    table = model.layers[0].get_weights()[0]
    pdb.set_trace()
def test(dataset, ckpt):
    """
    Test the model on the held-out test set
    **input: **
        *dataset: (String) Dataset folder to use
        *ckpt: (String) Path to the ckpt file to restore
    """
    # Load the mapping from class id to sign name
    with open("signnames.csv", "r") as f:
        signnames = f.read()
    id_to_name = {
        int(line.split(",")[0]): line.split(",")[1]
        for line in signnames.split("\n")[1:] if len(line) > 0
    }

    # Get the test dataset and scale pixel values to [0, 1]
    _, _, _, _, X_test, y_test = get_data(dataset)
    X_test = X_test / 255

    model = ModelTrafficSign("TrafficSign", output_folder=None)
    # Load the model
    model.load(ckpt)

    # Evaluate on the whole dataset
    loss, acc, predicted_class = model.evaluate_dataset(X_test, y_test)
    print("Accuracy = ", acc)
    print("Loss = ", loss)

    # Get the confusion matrix and dump it to disk
    cnf_matrix = confusion_matrix(y_test, predicted_class)
    np.savetxt("cnf.txt", cnf_matrix)
def select_tweet_frame(filename):
    if filename == 'tokenized_tweets_train.txt':
        train_tweets = get_data('tokenized_tweets_train.txt')
    elif filename == 'tokenized_tweets_test.txt':
        test_tweets = get_data('tokenized_tweets_test.txt')

    tweet_return = []
    if filename == 'tokenized_tweets_train.txt':
        with open('frames.txt', 'r') as frames:
            for tweet, frame in zip(train_tweets, frames):
                tweet_return.append((tweet, frame.strip()))
        print('Tweets selected:', len(tweet_return))
        return tweet_return
    else:
        with open('frames_test.txt', 'r') as frames:
            for tweet, frame in zip(test_tweets, frames):
                tweet_return.append((tweet, frame.strip()))
        print('Tweets selected:', len(tweet_return))
        return tweet_return
def select_tweets():
    # selects the tweets as in mean_glove_embedding method
    tweet_return_file = "cnn_tweets.pickle"

    # Load if a pickled file is available
    try:
        tweet_return = pickle.load(open(tweet_return_file, "rb"))
        print "Tweets loaded from pickled file."
    # Create and save otherwise
    except (OSError, IOError) as e:
        print "Loading tweets with embeddings available..."
        tweets = get_data()
        tweet_return = []
        for tweet in tweets:
            _emb = 0
            words = TOKENIZER(tweet['text'].lower())
            for w in words:
                if w in word2vec_model:  # Check if the embedding is in the GloVe model
                    _emb += 1
            if _emb:  # Not a blank tweet
                tweet_return.append(tweet)
        pickle.dump(tweet_return, open(tweet_return_file, "wb"))

    print 'Tweets selected:', len(tweet_return)
    return tweet_return
def save_selected_feature_results_to_sql(selected_feature_sets):
    name, features = selected_feature_sets
    full_feature_set = models.FEATURE_SETS
    new_feature_set = ['none']
    classifiers = models.CLASSIFIERS
    prefix = "results_%s" % name
    unselected_feature_sets = [f for f in full_feature_set if f not in features]

    if "halves" in features:
        polynomial_terms = feature_set_list.halves_features()
    else:
        polynomial_terms = None

    to_drop = []
    for feature_set in unselected_feature_sets:
        if feature_set == "cfg":
            to_drop += feature_set_list.cfg_features()
        elif feature_set == "syntactic_complexity":
            to_drop += feature_set_list.syntactic_complexity_features()
        elif feature_set == "psycholinguistic":
            to_drop += feature_set_list.psycholinguistic_features()
        elif feature_set == "vocabulary_richness":
            to_drop += feature_set_list.vocabulary_richness_features()
        elif feature_set == "repetitiveness":
            to_drop += feature_set_list.repetitiveness_features()
        elif feature_set == "acoustics":
            to_drop += feature_set_list.acoustics_features()
        elif feature_set == "demographic":
            to_drop += feature_set_list.demographic_features()
        elif feature_set == "parts_of_speech":
            to_drop += feature_set_list.parts_of_speech_features()
        elif feature_set == "information_content":
            to_drop += feature_set_list.information_content_features()
        elif feature_set == "strips":
            to_drop += feature_set_list.strips_features()
        elif feature_set == "halves":
            to_drop += feature_set_list.halves_features()
        elif feature_set == "quadrant":
            to_drop += feature_set_list.quadrant_features()

    for feature_set in new_feature_set:
        print 'Saving features: %s' % name
        X, y, labels = data_handler.get_data(drop_features=to_drop,
                                             polynomial_terms=polynomial_terms)
        print "Number of features used: ", len(X.values[0])
        trained_models = {
            model: DementiaCV(classifiers[model], X=X, y=y,
                              labels=labels).train_model('default')
            for model in classifiers
        }
        save_models_to_sql_helper(trained_models, prefix)
def driver(classifier):
    print(getTitle(classifier))
    if classifier == 4:
        trainX, trainY, testX, testY = data_handler.splitData2TestTrain(
            'ATNTFaceImages400.txt', 10, '1:10')
        print("\nAverage Accuracy for 5 folds: %s" %
              SVM.cross_validate(trainX, trainY, testX, testY))
    else:
        data, indexes = data_handler.get_data("ATNTFaceImages400.txt")
        print("\nAverage Accuracy for 5 folds: %s" %
              cross_validator(5, data, indexes, classifier))
def callback():
    # Auth Step 4: Requests refresh and access tokens
    global auth_token, post_request

    # Check to see if we already received authorization
    if auth_token is None:
        auth_token = request.args['code']
    code_payload = {
        "grant_type": "authorization_code",
        "code": str(auth_token),
        "redirect_uri": REDIRECT_URI,
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
    }
    if post_request is None:
        post_request = requests.post(SPOTIFY_TOKEN_URL, data=code_payload)

    # Auth Step 5: Tokens are returned to the application
    response_data = json.loads(post_request.text)
    access_token = response_data["access_token"]
    refresh_token = response_data["refresh_token"]
    token_type = response_data["token_type"]
    expires_in = response_data["expires_in"]

    dh.get_data(access_token, TIME_RANGE)

    # Retrieve and populate the datasets to display on the page
    top_track_names = dh.get_top_track_names()
    top_artist_names = dh.get_top_artist_names()
    top_artist_image = dh.get_top_artist_image()
    genres_data = dh.get_top_genres_data()

    viz.create_top_genres_pie_chart(genres_data, GENRE_PIE_CHART_FILE_PATH)
    viz.create_acoustic_vs_non_acoustic_pie_chart(
        dh.get_acoustic_data(), ACOUSTIC_PIE_CHART_FILE_PATH)
    viz.create_live_vs_studio_pie_chart(dh.get_live_data(),
                                        LIVE_PIE_CHART_FILE_PATH)

    top_genres = genres_data['top_50_genres_list']
    return render_template('stat-query.html',
                           artists=top_artist_names,
                           tracks=top_track_names,
                           top_artist_image=top_artist_image,
                           genres=top_genres)
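# Module-level names the callback above relies on. SPOTIFY_TOKEN_URL is
# Spotify's documented OAuth token endpoint; every other value below is a
# placeholder, not the original project's configuration.
SPOTIFY_TOKEN_URL = "https://accounts.spotify.com/api/token"
CLIENT_ID = "<your-client-id>"                    # placeholder
CLIENT_SECRET = "<your-client-secret>"            # placeholder
REDIRECT_URI = "http://localhost:8080/callback"   # placeholder
TIME_RANGE = "medium_term"  # Spotify accepts short_term, medium_term or long_term
auth_token = None           # set once the authorization code arrives
post_request = None         # caches the token-exchange response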
def __init__(self):
    print(os.path.join(config.out_dir, "171214_1.txt"))
    self.visualizer = visualization.VisualizerOpencv()
    self.position_data = data_handler.get_data(
        os.path.join(config.out_dir, "171214_1.txt"))
    self.face_aligner = FaceAlignment(LandmarksType._3D, device='cuda:0',
                                      flip_input=True)
    self.positions = {}
    self.s_frames = utils.load_seq_video()
    self.data = {name: [] for name in self.position_data}
    self.cur_img = None
def select_tweets():
    # selects the tweets as in mean_glove_embedding method
    train_tweets, test_tweets = get_data()
    tweet_return = []
    for tweet in train_tweets:
        _emb = 0
        words = TOKENIZER(tweet['text'].lower())
        for w in words:
            if w in word2vec_model:  # Check if the embedding is in the GloVe model
                _emb += 1
        if _emb:  # Not a blank tweet
            tweet_return.append(tweet)
    return tweet_return, test_tweets
def select_tweets(filename):
    # selects the tweets as in mean_glove_embedding method
    if filename == 'tokenized_tweets_train.txt':
        train_tweets = get_data('tokenized_tweets_train.txt')
    elif filename == 'tokenized_tweets_test.txt':
        test_tweets = get_data('tokenized_tweets_test.txt')

    tweet_return = []
    if filename == 'tokenized_tweets_train.txt':
        c = 1
        for tweet in train_tweets:
            _emb = 0
            words = glove_tokenize(tweet['text'].lower())
            for w in words:
                if w in word2vec_model:  # Check if the embedding is in the GloVe model
                    _emb += 1
            c = c + 1
            # if _emb:  # Not a blank tweet -- check disabled, so every tweet is kept
            tweet_return.append(tweet)
        print('Tweets selected:', len(tweet_return))
        # pdb.set_trace()
        return tweet_return
    else:
        c = 1
        for tweet in test_tweets:
            _emb = 0
            words = glove_tokenize(tweet['text'].lower())
            for w in words:
                if w in word2vec_model:  # Check if the embedding is in the GloVe model
                    _emb += 1
            c = c + 1
            # if _emb:  # Not a blank tweet -- check disabled, so every tweet is kept
            tweet_return.append(tweet)
        print('Tweets selected:', len(tweet_return))
        # pdb.set_trace()
        return tweet_return
def select_tweets_whose_embedding_exists():
    # selects the tweets as in mean_glove_embedding method
    tweets = get_data()
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = TOKENIZER(tweet['text'].lower())
        for w in words:
            if w in word2vec_model:  # Check if the embedding is in the GloVe model
                _emb += 1
        if _emb:  # Not a blank tweet
            tweet_return.append(tweet)
    print('Tweets selected:', len(tweet_return))
    return tweet_return
def getAbusiveFeatures():
    # Load the abusive-word dictionary
    m = {}
    with open('abusive_dict.txt', 'r') as f:
        for line in f:
            m[line.strip()] = True

    # Count abusive words per tweet
    tweets = get_data()
    X = []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        c = 0
        for word in text:
            if word in m:
                c = c + 1
        X.append(c)
    return np.array(X)
def calculate_build_data():
    breed, height, mass = dta.get_data()
    model, params = fit.fit_curve(height, mass)
    height_curve, mass_curve = model.to_points(params, height)

    # Arc length along the fitted curve, in variance-scaled units
    height_scale = np.var(height)
    mass_scale = np.var(mass)
    d_path = np.sqrt((np.diff(height_curve)**2) / height_scale +
                     (np.diff(mass_curve)**2) / mass_scale)
    path_position = np.cumsum(d_path)

    distance, size_indices = fit.distance_to_curve(height, mass,
                                                   height_curve, mass_curve)
    body_size = path_position[size_indices - 1]
    build = 10 * distance / np.sqrt(body_size)

    # Individuals lighter than the curve predicts get a negative build score
    estimated_mass = model.evaluate(params, height)
    i_negative = np.where(mass < estimated_mass)[0]
    build[i_negative] = -build[i_negative]
    return body_size, build, breed
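# A minimal usage sketch, assuming get_data() returns NumPy-compatible
# arrays: build is signed, so its sign splits the population around the
# fitted height-mass curve.
body_size, build, breed = calculate_build_data()
breed = np.asarray(breed)
heavier_than_expected = breed[build > 0]  # above the fitted curve
lighter_than_expected = breed[build < 0]  # below the fitted curve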
def select_tweets(tokenizer, word2vec_model):
    # selects the tweets as in mean_glove_embedding method
    tweets = get_data()
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = tokenizer(tweet['text'].lower())
        for w in words:
            if w in word2vec_model:  # Check if the embedding is in the GloVe model
                _emb += 1
        if _emb:  # Not a blank tweet
            tweet_return.append(tweet)
    print('Tweets selected:', len(tweet_return))
    # pdb.set_trace()
    return tweet_return
def select_tweets_whose_embedding_exists():
    # selects the tweets as in mean_glove_embedding method.
    # Only checks whether an embedding exists for at least one word within
    # the tweet; if it does, the tweet is accepted.
    tweets = get_data()
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = TOKENIZER(tweet['text'].lower())
        for w in words:
            if w in word2vec_model:  # Check if the embedding is in the GloVe model
                _emb += 1
        if _emb:  # Not a blank tweet
            tweet_return.append(tweet)
    print 'Tweets selected:', len(tweet_return)
    return tweet_return
def select_tweets_whose_embedding_exists():
    # selects the tweets as in mean_glove_embedding method
    tweets = get_data()
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        # words = glove_tokenize(tweet['text'])
        text = tweet['text'].encode("utf-8")
        words = glove_tokenize(text)
        for w in words:
            if w in vocab_json:  # Check if the embedding is in the GloVe vocabulary
                _emb += 1
        if _emb:  # Not a blank tweet
            tweet_return.append(tweet)
    print 'Tweets selected:', len(tweet_return)
    # pdb.set_trace()
    return tweet_return
def get_tfidf_features():
    # get_data() returns a list of tweets; each is a dict with keys
    # 'text', 'label' and 'user'
    tweets = get_data()
    y_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())  # tokenizing, e.g. converting '#' into <hashtag>
        text = ' '.join([c for c in text if c not in punctuation])  # removing punctuation
        X.append(text)
        y.append(y_map[tweet['label']])
    tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2), analyzer='word',
                                        stop_words='english', max_features=5000)
    X_tfidf = tfidf_transformer.fit_transform(X)
    print(X_tfidf.shape)
    return X_tfidf, np.array(y)
def get_tfidf_features():
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}
    tweets = get_data()
    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        text = ' '.join([c for c in text if c not in punctuation])
        # Keep only the 'sexism' class
        if y_map[tweet['label']] == 2:
            X.append(text)
            y.append(y_map[tweet['label']])
    tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2), analyzer='word',
                                        stop_words='english', max_features=2000)
    X_tfidf = tfidf_transformer.fit_transform(X)
    print(X_tfidf.shape)
    get_top_features(tfidf_transformer)
    return X_tfidf, np.array(y)
async def handleGet(request):
    try:
        request_type = request.rel_url.query['type']
    except KeyError:
        return web.Response(text="You must send the parameter type",
                            status=406)
    try:
        data = get_data(request_type)
        if data == -1:
            return web.Response(
                text="You sent an invalid type parameter. "
                     "Valid types: feminino, masculino, acessorio, all",
                status=406)
    except Exception as e:
        print(str(e))
        return web.Response(text="Something went wrong. Please try later",
                            status=500)
    return web.json_response(body=data)
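# One way to wire the handler into an aiohttp application; the route path
# '/products' is a guess, but web.Application, add_get and run_app are
# standard aiohttp API.
from aiohttp import web

app = web.Application()
app.router.add_get('/products', handleGet)

if __name__ == '__main__':
    web.run_app(app)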
def test(dataset, ckpt):
    """
    Test the model on the held-out test set
    **input: **
        *dataset: (String) Dataset folder to use
        *ckpt: (String) Path to the ckpt file to restore
    """
    # Load the mapping from class id to sign name
    with open("signnames.csv", "r") as f:
        signnames = f.read()
    id_to_name = {
        int(line.split(",")[0]): line.split(",")[1]
        for line in signnames.split("\n")[1:] if len(line) > 0
    }

    # Get the test dataset and scale pixel values to [0, 1]
    _, _, _, _, X_test, y_test = get_data(dataset)
    X_test = X_test / 255

    model = ModelTrafficSign("TrafficSign", output_folder=None)
    # Load the model
    model.load(ckpt)

    # Evaluate on the whole dataset
    loss, acc, predicted_class = model.evaluate_dataset(X_test, y_test)
    print("Accuracy = ", acc)
    print("Loss = ", loss)

    # Get and plot the confusion matrix
    cnf_matrix = confusion_matrix(y_test, predicted_class)
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=[str(i) for i in range(43)],
                          title='Confusion matrix, without normalization')
    plt.show()
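# Hypothetical invocation; the dataset folder and checkpoint path are
# placeholders for wherever train() stored its outputs.
test("./dataset", "./outputs/TrafficSign/TrafficSign.ckpt")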
def getAbusiveFeatures():
    y_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
    # Load the abusive-word dictionary
    m = {}
    with open('abusive_dict.txt', 'r') as f:
        for line in f:
            m[line.strip()] = True

    tweets = get_data()
    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())  # does it correct spelling as well?
        # Count abusive words in the tweet
        c = 0
        for word in text:
            if word in m:
                c = c + 1
        X.append([c])
        y.append(y_map[tweet['label']])
    return np.array(X), np.array(y)
def get_liwc_features_from_text():
    filenames = glob.glob("./LIWC_features/*.csv")
    print(filenames)
    y_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
    tweets = get_data()
    X, y = [], []

    # Build a dict mapping each LIWC file to the set of words it lists
    features_dict = {}
    for file in filenames:
        m = {}
        with open(file, 'r') as f:
            for line in f:
                m[line.strip()] = True
        features_dict[file] = m

    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        features = []
        for file in filenames:
            c = 1  # note: the count starts at 1, not 0
            for word in text:
                if any([word.startswith(s) for s in features_dict[file]]):
                    c = c + 1
            features.append(c)
        X.append(features)
        y.append(y_map[tweet['label']])

    # Normalise the results per column
    X = np.array(X)
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    return X, np.array(y)
learning_rate = 0.001
batch_size = 64
num_epochs = 100

# Get pretrained GloVe weights
pretrained_glove = get_GloveEmbed(word_index, glove_path, vocab_size, embed_dim)

# Get main target class weights based on the training set
pred_target_weights = np.genfromtxt(target_class_weights_path, delimiter=',')
pred_target_weights = torch.tensor(pred_target_weights).type(torch.float)

# Get adversary target class weights based on the training set
adv_target_weights = np.genfromtxt(adv_target_class_weights_path, delimiter=',')

# Get the train and validation data
train_set = get_data(train_path, word_index, seq_length, debias, batch_size,
                     num_workers, shuffle=True, pin_memory=pin_memory)
val_set = get_data(val_path, word_index, seq_length, False, batch_size,
                   num_workers, shuffle=True, pin_memory=pin_memory)

# Train loop
for iter in range(len(grid)):
    print(f"=> Start Training of Model {iter}")
    writer = SummaryWriter(f'debias_inf_board/training/models/debias {iter}')

    # Initialize networks
    me = ME(vocab_size, embed_dim, pretrained_glove, train_embed, hme_hidden,
            dropout, device)
    ce = CE(ce_input, hce_hidden)
    predictor = Predictor((hce_hidden + hme_hidden), pred_hidden, pred_classes)
    adversary = Adversary(hme_hidden, adv_hidden, adv_classes)
import operator
import sys  # needed below for sys.argv
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from my_tokenizer import glove_tokenize
import xgboost as xgb
import ast
import h5py
import pickle


### Preparing the text data
texts = []          # list of text samples
labels_index = {}   # dictionary mapping label name to numeric id
labels = []         # list of label ids

label_map = {'none': 0, 'racism': 1, 'sexism': 2}
tweet_data = get_data()
for tweet in tweet_data:
    texts.append(tweet['text'].lower())
    labels.append(label_map[tweet['label']])
print('Found %s texts. (samples)' % len(texts))

# logistic, gradient_boosting, random_forest, svm, tfidf_svm_linear, tfidf_svm_rbf
model_count = 2

word_embed_size = 200
GLOVE_MODEL_FILE = "glove_embeddings/glove.twitter.27B.200d.txt"
EMBEDDING_DIM = 200
MODEL_TYPE = sys.argv[1]
print 'Embedding Dimension: %d' % (EMBEDDING_DIM)
print 'GloVe Embedding: %s' % (GLOVE_MODEL_FILE)

# Load model
task += '-' + init
model_dir = models_dir + task
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
else:
    print 'already exists. exiting...'
    exit(-1)

logger = get_logger(task, model_dir)
logger.info(arguments)
logger.info(task)

home = expanduser("~")
configure(tensorboard_dir + task)

x_train, x_test = get_data(task_str, input_dir)
np.random.shuffle(x_train)
out_size = 2

with open(input_vocab, 'r') as f:
    vocab = f.readlines()
vocab = map(lambda s: s.strip(), vocab)
vocab_size = len(vocab)

adv_net = AdvNN(hid_size, hid_size, out_size, hid_size, adv_hid_size, out_size,
# for "yellow" and "blue" extract the tokens # put them into a tensor # get the embeddings of those 2 words # use np.inner to get the semantic simmilarity # what about blue and car? # can you come up with a different way of computing the semantic simmilarity? import torch import numpy as np import data_handler as dh model = torch.load('transformer_model1.pth', map_location=torch.device('cpu')) _, _, _, vocab = dh.get_data() # index (dtype = int) of the word (token) return from vocab yellow = vocab['yellow'] blue = vocab['blue'] car = vocab['car'] print('Index of word yellow is: ', yellow) print('Index of word blue is: ', blue) print('Index of word car is: ', car) # convert the index into tensor yellow = torch.tensor(yellow) blue = torch.tensor(blue) car = torch.tensor(car)
def train(dataset, ckpt=None, output=None):
    """
    Train the model
    **input: **
        *dataset: (String) Dataset folder to use
        *ckpt: (String) [Optional] Path to the ckpt file to restore
        *output: (String) [Optional] Path to the output folder to use. ./outputs/ by default
    """

    def preprocessing_function(img):
        """
        Custom preprocessing_function: random brightness and contrast jitter
        """
        img = img * 255
        img = Image.fromarray(img.astype('uint8'), 'RGB')
        img = ImageEnhance.Brightness(img).enhance(random.uniform(0.6, 1.5))
        img = ImageEnhance.Contrast(img).enhance(random.uniform(0.6, 1.5))
        return np.array(img) / 255

    X_train, y_train, X_valid, y_valid, X_test, y_test = get_data(dataset)
    X_train = X_train / 255
    X_valid = X_valid / 255
    X_test = X_test / 255

    train_datagen = ImageDataGenerator()
    train_datagen_augmented = ImageDataGenerator(
        rotation_range=20,
        shear_range=0.2,
        width_shift_range=0.2,
        height_shift_range=0.2,
        horizontal_flip=True,
        preprocessing_function=preprocessing_function)
    inference_datagen = ImageDataGenerator()
    train_datagen.fit(X_train)
    train_datagen_augmented.fit(X_train)
    inference_datagen.fit(X_valid)
    inference_datagen.fit(X_test)

    # Utils method to print the current progression
    def plot_progression(b, cost, acc, label):
        print("[%s] Batch ID = %s, loss = %s, acc = %s" % (label, b, cost, acc))

    # Init model
    model = ModelTrafficSign("TrafficSign", output_folder=output)
    if ckpt is None:
        model.init()
    else:
        model.load(ckpt)

    # Training pipeline
    b = 0
    valid_batch = inference_datagen.flow(X_valid, y_valid, batch_size=BATCH_SIZE)
    best_validation_loss = None
    augmented_factor = 0.99
    decrease_factor = 0.80
    train_batches = train_datagen.flow(X_train, y_train, batch_size=BATCH_SIZE)
    augmented_train_batches = train_datagen_augmented.flow(
        X_train, y_train, batch_size=BATCH_SIZE)

    while True:
        # Pick an augmented batch with probability augmented_factor
        next_batch = next(
            augmented_train_batches
            if random.uniform(0, 1) < augmented_factor else train_batches)
        x_batch, y_batch = next_batch

        ### Training
        cost, acc = model.optimize(x_batch, y_batch)

        ### Validation
        x_batch, y_batch = next(valid_batch, None)
        # Retrieve the cost and acc on this validation batch and save them in tensorboard
        cost_val, acc_val = model.evaluate(x_batch, y_batch, tb_test_save=True)

        if b % 10 == 0:  # Plot the last results
            plot_progression(b, cost, acc, "Train")
            plot_progression(b, cost_val, acc_val, "Validation")
        if b % 1000 == 0:  # Test the model on the whole validation set
            print("Evaluate full validation dataset ...")
            loss, acc, _ = model.evaluate_dataset(X_valid, y_valid)
            print("Current loss: %s Best loss: %s" % (loss, best_validation_loss))
            plot_progression(b, loss, acc, "TOTAL Validation")
            if best_validation_loss is None or loss < best_validation_loss:
                best_validation_loss = loss
                model.save()
            # Gradually use fewer augmented batches as training progresses
            augmented_factor = augmented_factor * decrease_factor
            print("Augmented Factor = %s" % augmented_factor)

        b += 1
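# Hypothetical entry point; the dataset folder is a placeholder. train()
# loops until interrupted, checkpointing whenever the full-validation loss improves.
if __name__ == '__main__':
    train("./dataset", ckpt=None, output="./outputs")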