Example #1
def feature_zipf(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_zipf.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_zipf = {}
        tokenizer = RegexpTokenizer(r'\w+')
        stop_words = set(stopwords.words('english'))
        for author in tqdm(author_data):
            comments = author_data[author]
            fd = FreqDist()
            for comment in comments:
                sentences = nltk.sent_tokenize(comment)
                for sentence in sentences:
                    sentence = sentence.lower()
                    tokens = tokenizer.tokenize(sentence)
                    filtered_sentence = [
                        w for w in tokens if w not in stop_words
                    ]
                    for word in filtered_sentence:
                        fd[word] += 1
            # Rank words by descending frequency so the slope reflects Zipf's
            # law; iterating a FreqDist directly does not guarantee frequency order.
            sorted_freqs = [freq for _word, freq in fd.most_common()]
            ranks = np.arange(1, len(sorted_freqs) + 1)
            freqs = np.array(sorted_freqs)
            slope = linefit_slope(np.log(ranks), np.log(freqs))
            author_zipf[author] = slope

        zipf_file = open(feature_filename, 'wb')
        pickle.dump(author_zipf, zipf_file)
        zipf_file.close()
    return feature_filename
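
linefit_slope is called above but not shown in this snippet. A minimal sketch, assuming it simply returns the least-squares slope of the (already log-transformed) frequencies against ranks:

import numpy as np

def linefit_slope(x, y):
    # Hypothetical helper, not part of the original snippet: fit
    # y = slope * x + intercept by least squares and return the slope.
    slope, _intercept = np.polyfit(x, y, deg=1)
    return slope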
Example #2
def feature_acronym(dataset_filename_pkl, acronyms_filename):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_acronym.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        tokenizer = RegexpTokenizer(r'\w+')
        author_acronym = {}
        # A set gives O(1) membership tests for the acronym lookups below.
        acronyms = set(open(acronyms_filename).read().splitlines())
        for author in tqdm(author_data):
            comments = author_data[author]
            # Per-author counters, so one author's rate does not include another's.
            acronym_count = 0
            character_count = 0
            for comment in comments:
                sentences = nltk.sent_tokenize(comment)
                for sentence in sentences:
                    sentence = sentence.lower()
                    tokens = tokenizer.tokenize(sentence)
                    if "tl&dr" in sentence or "tl;dr" in sentence:
                        acronym_count += 1
                    for token in tokens:
                        if token in acronyms:
                            acronym_count += 1
                character_count += len(comment)
            acronym_rate = acronym_count / character_count
            author_acronym[author] = acronym_rate

        acronym_file = open(feature_filename, 'wb')
        pickle.dump(author_acronym, acronym_file)
        acronym_file.close()
    return feature_filename
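
The feature_* functions in these snippets all cache their result as a pickle and return the filename, and utils.load_feature is presumably the matching reader. A minimal sketch of that round trip, assuming load_feature is just pickle.load (the paths below are placeholders):

import pickle

def load_feature(filename_pkl):
    # Assumed counterpart of the pickle.dump calls above: return the
    # stored {author: value} dict (or the raw {author: [comments]} data).
    with open(filename_pkl, 'rb') as f:
        return pickle.load(f)

# Typical chaining: build (or reuse) the cached feature, then read it back.
feature_file = feature_acronym('data/authors.pkl', 'acronyms.txt')
author_acronym = load_feature(feature_file)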
Example #3
def main(cfg: DictConfig):
    logger = utils.get_logger()
    X_train, y_trains, X_test = utils.load_feature(cfg)
    y_pred = multi_target_training(cfg, X_train, y_trains, X_test, logger)
    logger.info("Make submission")
    make_submission(cfg, y_pred)
    logger.info("Finished Training and Prediction!")
Example #4
def generator(batch_size, df, labels, index_max, index_list, feature_name_list,
              feature_size_dict):
    while True:
        indices = random.sample(index_list, batch_size)

        # Labels are identical for every feature, so build them once per batch.
        labels_batch = np.zeros((batch_size, 50))
        for count, batch_index in enumerate(indices):
            labels_batch[count] = labels[batch_index]

        feature_batch_list = []
        for feature_name in feature_name_list:
            feature_size = feature_size_dict[feature_name]
            if len(feature_size) == 1:
                feature_batch = np.zeros((batch_size, feature_size[0]))
            elif len(feature_size) == 2:
                feature_batch = np.zeros(
                    (batch_size, feature_size[0], feature_size[1]))

            for count, batch_index in enumerate(indices):
                feature = load_feature(df.features_dir[batch_index])
                feature_batch[count] = feature[feature_name]

            feature_batch_list.append(feature_batch)

        yield feature_batch_list, labels_batch
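
A hedged usage sketch for this generator; the feature names, shapes, DataFrame and label array below are placeholders, assumed to match a 50-tag dataset whose per-clip features live in pickles referenced by a features_dir column:

# Placeholder setup: df_train needs a 'features_dir' column of feature-pickle
# paths, and train_labels maps df indices to 50-dimensional tag vectors.
feature_name_list = ['mfcc', 'spectrogram']
feature_size_dict = {'mfcc': (20,), 'spectrogram': (96, 1366)}
train_gen = generator(batch_size=16, df=df_train, labels=train_labels,
                      index_max=len(df_train), index_list=list(df_train.index),
                      feature_name_list=feature_name_list,
                      feature_size_dict=feature_size_dict)
feature_batches, label_batch = next(train_gen)  # one batch, ready for model.fit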
Example #5
    def load_feature(self):
        feature, dim = utils.load_feature(self.params.embedding_file)
        self.params.feature_dim = dim
        embedding = np.empty([self.params.num_node, dim])
        for name, vector in feature.items():
            embedding[self.name_to_id[name]] = vector
        return embedding
Example #6
def feature_to_cluster(feature_filenames_cluster, num_clusters):    
    num_authors_effective = len(utils.load_feature(feature_filenames_cluster[0]))
    points = np.ndarray((num_authors_effective, len(feature_filenames_cluster))) # rows for authors, columns for features
    for j in range(len(feature_filenames_cluster)):
        feature_data = utils.load_feature(feature_filenames_cluster[j])
        feature_list = list(feature_data.items())
        authors = list(feature_data.keys())
        feature_expon = []
        for i, val in enumerate(feature_list):
            # feature_expon.append((val[0], 10**(val[1])))  # features converted to exponential scale for better resolution in distance
            feature_expon.append((val[0], val[1]))
        amin, amax = (min(feature_expon, key=lambda tup: tup[1]),
                      max(feature_expon, key=lambda tup: tup[1]))

        feature_normed = []
        for i, val in enumerate(feature_expon):
            if amax[1] == amin[1]:  # prevents division by zero
                feature_normed.append((val[0], val[1]))
            else:
                feature_normed.append(
                    (val[0], (val[1] - amin[1]) / (amax[1] - amin[1])))
        for i in tqdm(range(len(authors))):
            points[i, j] = feature_normed[i][1]
    # Replace NaNs with 0 first, then clamp any remaining non-finite values to 1.
    points[np.isnan(points)] = 0  # for safety
    points[~np.isfinite(points)] = 1  # for safety
    
    clusters = {}
    kmeans = KMeans(n_clusters=num_clusters[0])
    kmeans = kmeans.fit(points)
    labels = kmeans.predict(points)
    for i in tqdm(range(len(authors))):
        clusters[authors[i]] = (labels[i],)

    for cluster_size in num_clusters[1:]:
        kmeans = KMeans(n_clusters=cluster_size)
        kmeans = kmeans.fit(points)
        labels = kmeans.predict(points)
        for i in tqdm(range(len(authors))):
            clusters[authors[i]] += (labels[i],)

    kmeans_file = open('kmeans_clusters.pkl', 'wb')
    pickle.dump(clusters, kmeans_file)
    kmeans_file.close()
    return clusters
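
A hedged usage sketch for feature_to_cluster: it takes a list of cached feature pickles plus the cluster counts to try (the filenames below are placeholders for files produced by the feature_* functions above):

feature_files = [
    'features/authors_feature_zipf.pkl',
    'features/authors_feature_acronym.pkl',
    'features/authors_feature_punct.pkl',
]
clusters = feature_to_cluster(feature_files, num_clusters=[5, 10, 20])
# clusters maps each author to a tuple of labels, one entry per cluster size.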
Example #7
def feature_to_graph(feature_file):
    graph_filename = "networks/" + os.path.basename(
        feature_file)[:-4] + "_graph.txt"
    graph_file = open(graph_filename, "w")
    feature = utils.load_feature(feature_file)
    feature_list = list(feature.items())
    feature_list.sort(key=lambda tup: tup[1])
    distance_adjustment = 0.01

    # Setting the log scale
    feature_logged = []
    # Get the smallest value after 0
    smallest = 0
    for val in feature_list:
        if val[1] > 0:
            smallest = val
            break
    if smallest == 0:  # Every author has value 0
        return graph_filename  # Return the empty graph

    for i, val in enumerate(feature_list):
        if val[1] > 0:
            feature_logged.append((val[0], math.log10(val[1])))
        else:
            feature_logged.append((val[0], math.log10(smallest[1])))

    # Normalizing the feature list
    amin, amax = min(feature_logged,
                     key=lambda tup: tup[1]), max(feature_logged,
                                                  key=lambda tup: tup[1])
    feature_normed = []
    for i, val in enumerate(feature_logged):
        feature_normed.append(
            (val[0], (val[1] - amin[1]) / (amax[1] - amin[1])))

    std_deviation = statistics.stdev(list(map(lambda x: x[1], feature_normed)))

    for i in tqdm(range(len(feature_normed))):
        edges = []
        for j in range(i + 1, len(feature_normed)):
            distance = feature_normed[j][1] - feature_normed[i][1]
            if distance < std_deviation * distance_adjustment:  # Similarity must be at least 1 - std_deviation * distance_adjustment
                similarity = 1 - distance
                edges.append((feature_normed[j][0], similarity))
            else:  # Otherwise similarity is treated as zero; the list is sorted, so no need to check the rest
                break
        # Write the edges to file
        for edge in edges:
            print(feature_normed[i][0] + " " + edge[0] + " " + str(edge[1]),
                  file=graph_file)
    return graph_filename
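
The graph files written by feature_to_graph are plain whitespace-separated weighted edge lists, so they can presumably be read back with networkx; a minimal sketch (the path is a placeholder):

import networkx as nx

g = nx.read_weighted_edgelist('networks/authors_feature_zipf_graph.txt')
print(g.number_of_nodes(), g.number_of_edges())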
Example #8
def feature_profanity(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_profanity.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_profanity = {}
        for author in tqdm(author_data):
            single_text = ''.join(author_data[author])
            profanity_rate = predict_prob([single_text])
            author_profanity[author] = profanity_rate[0]

        profanity_file = open(feature_filename, 'wb')
        pickle.dump(author_profanity, profanity_file)
        profanity_file.close()
    return feature_filename
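
predict_prob is not defined in this snippet; it presumably comes from the profanity-check package, which maps each input string to the probability that it is offensive. A minimal sketch of that assumption:

# Assumed import for the snippet above (pip install profanity-check).
from profanity_check import predict_prob

print(predict_prob(['have a nice day']))  # expected: a probability close to 0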
Example #9
def overlapping_ngrams(ngram_feature_filename):
    author_ngrams = utils.load_feature(ngram_feature_filename)
    unigram_filename = "networks/" + os.path.basename(
        ngram_feature_filename)[:-4] + "_unigram_overlap_graph.txt"
    bigram_filename = "networks/" + os.path.basename(
        ngram_feature_filename)[:-4] + "_bigram_overlap_graph.txt"
    trigram_filename = "networks/" + os.path.basename(
        ngram_feature_filename)[:-4] + "_trigram_overlap_graph.txt"
    unigram_file = open(unigram_filename, "w")
    bigram_file = open(bigram_filename, "w")
    trigram_file = open(trigram_filename, "w")
    authors = list(author_ngrams.keys())
    for i in tqdm(range(len(authors))):
        source_ngrams = author_ngrams[authors[i]]
        for j in range(i + 1, len(authors)):
            target_ngrams = author_ngrams[authors[j]]
            # Unigrams
            if len(source_ngrams.unigrams) > 0:
                unigram_ratio = len(
                    source_ngrams.unigrams.intersection(
                        target_ngrams.unigrams)) / len(source_ngrams.unigrams)
                if unigram_ratio >= 0.15:  # Threshold
                    print(authors[i] + " " + authors[j] + " " +
                          str(unigram_ratio),
                          file=unigram_file)
            # Bigrams
            if len(source_ngrams.bigrams) > 0:
                bigram_ratio = len(
                    source_ngrams.bigrams.intersection(
                        target_ngrams.bigrams)) / len(source_ngrams.bigrams)
                if bigram_ratio >= 0.05:
                    print(authors[i] + " " + authors[j] + " " +
                          str(bigram_ratio),
                          file=bigram_file)
            # Trigrams
            if len(source_ngrams.trigrams) > 0:
                trigram_ratio = len(
                    source_ngrams.trigrams.intersection(
                        target_ngrams.trigrams)) / len(source_ngrams.trigrams)
                if trigram_ratio > 0.01:
                    print(authors[i] + " " + authors[j] + " " +
                          str(trigram_ratio),
                          file=trigram_file)
    unigram_file.close()
    bigram_file.close()
    trigram_file.close()
    return [unigram_filename, bigram_filename, trigram_filename]
Example #10
def feature_sentence_length(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_sentence_length.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_sentence_length = {}
        for author in tqdm(author_data):
            single_text = ''.join(author_data[author])
            sentences = nltk.sent_tokenize(single_text)
            if len(sentences) > 0:  # Just a precaution
                total_sentence_length = sum(len(s) for s in sentences)
                author_sentence_length[author] = total_sentence_length / len(
                    sentences)
        sentence_length_file = open(feature_filename, 'wb')
        pickle.dump(author_sentence_length, sentence_length_file)
        sentence_length_file.close()
    return feature_filename
Example #11
def feature_grammar_check(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_grammar_check.pkl'
    if not os.path.isfile(feature_filename):
        tool = language_check.LanguageTool('en-US')
        author_data = utils.load_feature(dataset_filename_pkl)
        author_grammars = {}
        for author in tqdm(author_data):
            single_text = ''.join(author_data[author])
            sentences = nltk.sent_tokenize(single_text)
            if len(sentences) > 0:  # Just a precaution
                matches = tool.check(single_text)
                author_grammars[author] = len(matches) / len(sentences)

        grammar_file = open(feature_filename, 'wb')
        pickle.dump(author_grammars, grammar_file)
        grammar_file.close()
    return feature_filename
Example #12
def calcurate_roc_auc(df, num_test, num_segment, feature_name_list,
                      feature_size_dict, model):
    y_pred = np.zeros((num_test, 50))
    y_true = np.zeros((num_test, 50))
    count = 0
    for index in df.index:
        split = df.split[index]
        clip_id = df.clip_id[index]
        if split == 'test':
            print('index', index)
            predict_label_sum = np.zeros(50)
            for j in range(1, 11):
                feature_dir = 'features_norm/feature_%s_%d_%d.pickle' % (
                    split, clip_id, j)
                features = load_feature(feature_dir)

                feature_list = []
                for feature_name in feature_name_list:
                    feature_size = feature_size_dict[feature_name]
                    feature = features[feature_name]
                    if len(feature_size) == 1:
                        feature = feature.reshape(1, feature_size[0])
                    if len(feature_size) == 2:
                        feature = feature.reshape(1, feature_size[0],
                                                  feature_size[1])
                    feature_list.append(feature)

                predict_label = model.predict(feature_list).reshape(50)

                predict_label_sum += predict_label

            predict_label_song = predict_label_sum / num_segment
            true_label_song = df.iloc[:,
                                      1:51][index:index + 1].values.reshape(50)

            y_pred[count] = predict_label_song
            y_true[count] = true_label_song

            count += 1

    roc_auc = roc_auc_score(y_true, y_pred, average='macro')
    return roc_auc
Example #13
def feature_emoji(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_emoji.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_emoji = {}
        for author in tqdm(author_data):
            comments = author_data[author]
            emoji_count = 0
            character_count = 0
            for comment in comments:
                # Count common emoticons such as :-) ;( xD =P :S
                emoji_count += len(
                    re.findall(r'(?::|;|=|x)(?:-)?(?:\)|\(|D|P|S)', comment))
                character_count += len(comment)
            emoji_rate = emoji_count / character_count
            author_emoji[author] = emoji_rate

        emoji_file = open(feature_filename, 'wb')
        pickle.dump(author_emoji, emoji_file)
        emoji_file.close()
    return feature_filename
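
A quick, hedged sanity check of the emoticon pattern used above, assuming the intent is to match the usual :-) ;( xD =P style smileys:

import re

emoticon_re = re.compile(r'(?::|;|=|x)(?:-)?(?:\)|\(|D|P|S)')
sample = "great :-) thanks ;( wow xD hmm :-S"
print(emoticon_re.findall(sample))  # [':-)', ';(', 'xD', ':-S']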
Example #14
    def __getitem__(self, index):
        vid, duration, timestamps, sentence, words, id2pos, adj_mat = self.data[index]
        feats = load_feature(os.path.join(self.feature_path, 'tall_c3d_features.hdf5'), vid=vid, dataset='TACOS')
        fps = feats.shape[0] / duration
        adj_mat = np.asarray(adj_mat)
        start_frame = int(fps * timestamps[0])
        end_frame = int(fps * timestamps[1])
        if end_frame >= feats.shape[0]:
            end_frame = feats.shape[0] - 1
        if start_frame > end_frame:
            start_frame = end_frame
        assert start_frame <= end_frame
        assert 0 <= start_frame < feats.shape[0]
        assert 0 <= end_frame < feats.shape[0]
        label = np.asarray([start_frame, end_frame]).astype(np.int32)

        words_vec = np.asarray([self.word2vec[word] if word in self.word2vec.vocab else np.zeros(300).astype(np.float32) for word in words])
        words_vec = words_vec.astype(np.float32)

        id2pos = np.asarray(id2pos).astype(np.int64)
        return feats, words_vec, label, id2pos, adj_mat.astype(np.int32)
Example #15
    def __getitem__(self, index):
        vid, duration, timestamps, sentence = self.data[index]
        feats = load_feature(os.path.join(self.feature_path, '%s.npy' % vid[:-4]), dataset='TACOS')
        fps = feats.shape[0] / duration

        start_frame = int(fps * timestamps[0])
        end_frame = int(fps * timestamps[1])
        if end_frame >= feats.shape[0]:
            end_frame = feats.shape[0] - 1
        if start_frame > end_frame:
            start_frame = end_frame
        assert start_frame <= end_frame
        assert 0 <= start_frame < feats.shape[0]
        assert 0 <= end_frame < feats.shape[0]
        label = np.asarray([start_frame, end_frame]).astype(np.int32)

        words = tokenize(sentence, self.word2vec)
        words_vec = np.asarray([self.word2vec[word] for word in words])
        words_vec = words_vec.astype(np.float32)

        return feats, words_vec, label
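
The load_feature used by these two datasets (called with a file path plus vid/dataset keyword arguments) is not shown here. A minimal sketch, assuming HDF5 files hold one C3D feature matrix per video id and .npy files hold one video each:

import h5py
import numpy as np

def load_feature(path, vid=None, dataset=None):
    # Hypothetical reader matching the two call sites above.
    if path.endswith('.hdf5'):
        with h5py.File(path, 'r') as f:
            return np.asarray(f[vid]).astype(np.float32)
    return np.load(path).astype(np.float32)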
Example #16
def feature_ngrams(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_ngrams.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        tokenizer = RegexpTokenizer(r'\w+')
        stop_words = set(stopwords.words('english'))
        author_ngrams = {}
        for author in tqdm(author_data):
            comments = author_data[author]
            unigrams_set = set()
            bigrams_set = set()
            trigrams_set = set()
            for comment in comments:
                sentences = nltk.sent_tokenize(comment)
                for sentence in sentences:
                    sentence = sentence.lower()
                    tokens = tokenizer.tokenize(sentence)
                    filtered_sentence = [
                        w for w in tokens if w not in stop_words
                    ]
                    bigrams = ngrams(filtered_sentence, 2)
                    trigrams = ngrams(filtered_sentence, 3)
                    for unigram in filtered_sentence:  # Tokens are already unigrams
                        unigrams_set.add(unigram)
                    for bigram in bigrams:
                        bigrams_set.add(bigram)
                    for trigram in trigrams:
                        trigrams_set.add(trigram)
            # Now we have a set of ngrams of each author
            author_ngram_sets = NgramSets()
            author_ngram_sets.unigrams = unigrams_set
            author_ngram_sets.bigrams = bigrams_set
            author_ngram_sets.trigrams = trigrams_set
            author_ngrams[author] = author_ngram_sets

        ngrams_file = open(feature_filename, 'wb')
        pickle.dump(author_ngrams, ngrams_file)
        ngrams_file.close()
    return feature_filename
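
NgramSets is used as a plain container above but is not defined in the snippet; a minimal sketch of what it presumably looks like:

class NgramSets:
    """Hypothetical container for one author's unigram/bigram/trigram sets."""

    def __init__(self):
        self.unigrams = set()
        self.bigrams = set()
        self.trigrams = set()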
Example #17
def feature_uppercase(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_uppercase.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_uppercase = {}
        for author in tqdm(author_data):
            comments = author_data[author]
            uppercase_count = 0
            character_count = 0
            for comment in comments:
                for character in comment:
                    if character.isupper():
                        uppercase_count += 1
                # Count each comment's length once, not once per character.
                character_count += len(comment)
            uppercase_rate = uppercase_count / character_count
            author_uppercase[author] = uppercase_rate

        uppercase_file = open(feature_filename, 'wb')
        pickle.dump(author_uppercase, uppercase_file)
        uppercase_file.close()
    return feature_filename
Example #18
def feature_punct(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_punct.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_punct = {}
        for author in tqdm(author_data):
            comments = author_data[author]
            punct_count = 0
            character_count = 0
            for comment in comments:
                for character in comment:
                    if character in string.punctuation:
                        punct_count += 1
                # Count each comment's length once, not once per character.
                character_count += len(comment)
            punct_rate = punct_count / character_count
            author_punct[author] = punct_rate

        punct_file = open(feature_filename, 'wb')
        pickle.dump(author_punct, punct_file)
        punct_file.close()
    return feature_filename
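
Examples #13, #17 and #18 all compute a per-character rate over an author's comments; a hedged consolidation of that shared pattern (not part of the original code):

def character_rate(comments, is_hit):
    # Count characters for which is_hit is true and divide by total characters.
    hits = sum(1 for comment in comments for ch in comment if is_hit(ch))
    total = sum(len(comment) for comment in comments)
    return hits / total if total else 0.0

# e.g. the uppercase rate for one author:
# character_rate(author_data[author], str.isupper)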
Example #19
"""
@File: 02_xgb_cv_poly.py
@Time: 2018/10/24 18:13
@Software: PyCharm
@Description:
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from utils import load_feature

df_train, df_test, label = load_feature()
scaler = StandardScaler()
X = df_train.drop(['日期'], axis=1, inplace=False)
X = scaler.fit_transform(X)
y = label.values

# Data used for the submission predictions
sub_x = df_test.drop(['日期'], axis=1, inplace=False)
# Apply the scaler fitted on the training data; refitting it on the test set
# would leak test-set statistics.
sub_x = scaler.transform(sub_x)

kf = KFold(n_splits=5, random_state=123, shuffle=True)
clf = XGBRegressor(objective='reg:linear',
                   n_estimators=1000,
                   min_child_weight=1,
                   learning_rate=0.01,
                   max_depth=5,
Example #20
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    pcanet = PCANet(args.stages, args.filter_shape, args.stages_channels,
                    args.block_size, args.block_overlap)
    train_queue, valid_queue = load_train_mnist(args)  # load dataset
    logging.info("load training dataset completely")
    total_train_labels = torch.tensor([]).long()

    writer = SummaryWriter(args.save)  # tensorboardX

    # extract feature from images
    with torch.no_grad():
        # first generate the eigenvectors, then run the convolutions
        stage_save_path = args.save
        save_filename = utils.create_pickle_file_name(stage_save_path, 0)
        for global_step, (train_images,
                          train_labels) in enumerate(train_queue):
            train_images = train_images.cuda()
            total_train_labels = torch.cat((total_train_labels, train_labels))
            utils.save_feature([train_images, train_labels], save_filename)
            pcanet.unrolled_stage(train_images, 0)

            if global_step % args.log_freq == 0:
                logging.info("init training global_step: %d" % global_step)
                # convert a batch of tensor into CHW format
                grid_images = make_grid(train_images,
                                        nrow=16,
                                        padding=5,
                                        pad_value=125)
                writer.add_image("raw_images_in_step_%d" % global_step,
                                 grid_images)

        total_features = torch.tensor([])  # empty tensor
        for stage in range(args.stages):
            logging.info('PCANet stage: %d' % stage)

            # transform eigenvector to convolution kernel
            kernel = pcanet.eigenvector_to_kernel(stage)

            load_filename = utils.create_pickle_file_name(
                stage_save_path, stage)
            if stage + 1 < args.stages:
                save_filename = utils.create_pickle_file_name(
                    stage_save_path, stage + 1)

            load_filename_pointer = 0  # clear file object pointer
            for step in range(global_step + 1):
                train_images, train_labels, load_filename_pointer = \
                    utils.load_feature(load_filename, load_filename_pointer)
                batch_features = pcanet.pca_conv(train_images, kernel)
                if step % args.log_freq == 0:
                    # view one image's feature maps (index 5) within the batch
                    single_image_feature = utils.exchange_channel(
                        batch_features[5])
                    grid_images = make_grid(single_image_feature,
                                            nrow=8,
                                            padding=5,
                                            pad_value=125)
                    writer.add_image(
                        "feature_image_in_step_%d_in_stage_%d" % (step, stage),
                        grid_images)

                if stage + 1 < args.stages:
                    utils.save_feature([batch_features, train_labels],
                                       save_filename)
                    pcanet.unrolled_stage(batch_features, stage + 1)
                else:
                    decimal_features = pcanet.binary_mapping(
                        batch_features, stage)
                    final_features = pcanet.generate_histogram(
                        decimal_features)
                    final_features = final_features.cpu()
                    total_features = torch.cat(
                        (total_features, final_features), dim=0)

                if step % args.log_freq == 0:
                    logging.info("circulate training step: %d" % step)

            grid_kernels = make_grid(pcanet.kernel[stage],
                                     nrow=args.stages_channels[stage],
                                     padding=5,
                                     pad_value=125)
            writer.add_image("kernel_in_stage_%d" % stage, grid_kernels)

        writer.close()
        logging.info('extract feature completely, start training classifier')

        # train classifier
        classifier = LinearSVC()
        # classifier = SVC()
        # total_features = total_features.cpu()
        classifier.fit(total_features, total_train_labels)
        logging.info('classifier trained completely')

        # save model
        utils.save_model(pcanet, stage_save_path + "/pcanet.pkl")
        utils.save_model(classifier, stage_save_path + "/classifier.pkl")

        train_score = classifier.score(total_features, total_train_labels)
        logging.info("score of training is %s" % train_score)