def feature_zipf(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_zipf.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_zipf = {}
        tokenizer = RegexpTokenizer(r'\w+')
        stop_words = set(stopwords.words('english'))
        for author in tqdm(author_data):
            comments = author_data[author]
            fd = FreqDist()
            for comment in comments:
                sentences = nltk.sent_tokenize(comment)
                for sentence in sentences:
                    sentence = sentence.lower()
                    tokens = tokenizer.tokenize(sentence)
                    filtered_sentence = [
                        w for w in tokens if w not in stop_words
                    ]
                    for word in filtered_sentence:
                        fd[word] += 1
            # Rank words by descending frequency; iterating a FreqDist
            # directly yields insertion order, which would break the fit.
            ranks = np.arange(1, len(fd) + 1)
            freqs = np.array([freq for _word, freq in fd.most_common()])
            slope = linefit_slope(np.log(ranks), np.log(freqs))
            author_zipf[author] = slope
        with open(feature_filename, 'wb') as zipf_file:
            pickle.dump(author_zipf, zipf_file)
    return feature_filename
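# `linefit_slope` is not defined in this section. A minimal sketch, assuming
# it returns the ordinary least-squares slope of y against x (the Zipf
# exponent in the log-log fit above):
import numpy as np

def linefit_slope(x, y):
    # Slope of the least-squares line y = slope * x + intercept.
    slope, _intercept = np.polyfit(x, y, deg=1)
    return slope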
def feature_acronym(dataset_filename_pkl, acronyms_filename):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_acronym.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        tokenizer = RegexpTokenizer(r'\w+')
        author_acronym = {}
        with open(acronyms_filename) as acronyms_file_in:
            acronyms = set(acronyms_file_in.read().splitlines())
        for author in tqdm(author_data):
            # Reset the counters for each author; accumulating them across
            # authors would mix every earlier author into the rate.
            acronym_count = 0
            character_count = 0
            comments = author_data[author]
            for comment in comments:
                sentences = nltk.sent_tokenize(comment)
                for sentence in sentences:
                    sentence = sentence.lower()
                    tokens = tokenizer.tokenize(sentence)
                    if "tl&dr" in sentence or "tl;dr" in sentence:
                        acronym_count += 1
                    for token in tokens:
                        if token in acronyms:
                            acronym_count += 1
                character_count += len(comment)
            acronym_rate = acronym_count / character_count
            author_acronym[author] = acronym_rate
        with open(feature_filename, 'wb') as acronym_file:
            pickle.dump(author_acronym, acronym_file)
    return feature_filename
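# The acronyms file is read with splitlines(), so a plain text file with one
# lowercase acronym per line is all the function expects. A usage sketch; the
# file contents and dataset path below are hypothetical:
with open('acronyms.txt', 'w') as f:
    f.write('\n'.join(['lol', 'imo', 'afaik', 'brb', 'tbh']))

feature_file = feature_acronym('datasets/sample.pkl', 'acronyms.txt')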
def main(cfg: DictConfig):
    logger = utils.get_logger()
    X_train, y_trains, X_test = utils.load_feature(cfg)
    y_pred = multi_target_training(cfg, X_train, y_trains, X_test, logger)
    logger.info("Make submission")
    make_submission(cfg, y_pred)
    logger.info("Finished Training and Prediction!")
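# The DictConfig annotation suggests this entry point is driven by Hydra.
# A minimal sketch of the usual wiring; the "conf"/"config" locations are
# assumptions, not taken from this section:
import hydra
from omegaconf import DictConfig

@hydra.main(config_path="conf", config_name="config")
def main(cfg: DictConfig):
    ...

if __name__ == "__main__":
    main()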
def generator(batch_size, df, labels, index_max, index_list,
              feature_name_list, feature_size_dict):
    while True:
        indices = random.sample(index_list, batch_size)
        feature_batch_list = []
        for feature_name in feature_name_list:
            feature_size = feature_size_dict[feature_name]
            if len(feature_size) == 1:
                feature_batch = np.zeros((batch_size, feature_size[0]))
            elif len(feature_size) == 2:
                feature_batch = np.zeros(
                    (batch_size, feature_size[0], feature_size[1]))
            labels_batch = np.zeros((batch_size, 50))
            count = 0
            for batch_index in indices:
                df_dir = df.features_dir[batch_index]
                feature = load_feature(df_dir)
                feature_batch[count] = feature[feature_name]
                label = labels[batch_index]
                labels_batch[count] = label
                count += 1
            feature_batch_list.append(feature_batch)
        yield feature_batch_list, labels_batch
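# A sketch of how this generator would typically feed a multi-input Keras
# model (one input per entry in feature_name_list). All names below
# (df_train, train_labels, train_indices, feature_names, feature_sizes,
# model) are hypothetical, not from the source:
train_gen = generator(batch_size=32, df=df_train, labels=train_labels,
                      index_max=len(df_train), index_list=train_indices,
                      feature_name_list=feature_names,
                      feature_size_dict=feature_sizes)
model.fit(train_gen,
          steps_per_epoch=len(train_indices) // 32,
          epochs=10)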
def load_feature(self):
    feature, dim = utils.load_feature(self.params.embedding_file)
    self.params.feature_dim = dim
    embedding = np.empty([self.params.num_node, dim])
    for name, vector in feature.items():
        embedding[self.name_to_id[name]] = vector
    return embedding
def feature_to_cluster(feature_filenames_cluster, num_clusters):
    num_authors_effective = len(
        utils.load_feature(feature_filenames_cluster[0]))
    # Rows for authors, columns for features
    points = np.ndarray(
        (num_authors_effective, len(feature_filenames_cluster)))
    for j in range(len(feature_filenames_cluster)):
        feature_data = utils.load_feature(feature_filenames_cluster[j])
        feature_list = list(feature_data.items())
        authors = list(feature_data.keys())
        feature_expon = []
        for i, val in enumerate(feature_list):
            # feature_expon.append((val[0], 10**(val[1])))  # features could be
            # converted to an exponential scale for better distance resolution
            feature_expon.append((val[0], val[1]))
        amin = min(feature_expon, key=lambda tup: tup[1])
        amax = max(feature_expon, key=lambda tup: tup[1])
        feature_normed = []
        for i, val in enumerate(feature_expon):
            if amax[1] == amin[1]:  # Prevents division by zero
                feature_normed.append((val[0], val[1]))
            else:
                feature_normed.append(
                    (val[0], (val[1] - amin[1]) / (amax[1] - amin[1])))
        for i in tqdm(range(len(authors))):
            points[i, j] = feature_normed[i][1]
    # Sanitize for safety: NaNs become 0 first, then any remaining
    # non-finite values (infinities) become 1. The original order set NaNs
    # to 1 and made the NaN branch dead code.
    points[np.isnan(points)] = 0
    points[~np.isfinite(points)] = 1
    clusters = {}
    kmeans = KMeans(n_clusters=num_clusters[0])
    kmeans = kmeans.fit(points)
    labels = kmeans.predict(points)
    for i in tqdm(range(len(authors))):
        clusters[authors[i]] = (labels[i],)
    for cluster_size in num_clusters[1:]:
        kmeans = KMeans(n_clusters=cluster_size)
        kmeans = kmeans.fit(points)
        labels = kmeans.predict(points)
        for i in tqdm(range(len(authors))):
            clusters[authors[i]] += (labels[i],)
    with open('kmeans_clusters.pkl', 'wb') as kmeans_file:
        pickle.dump(clusters, kmeans_file)
    return clusters
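# A sketch of how the per-author feature files produced by the feature_*
# functions feed the clustering step. The dataset path and cluster sizes
# are hypothetical:
dataset = 'datasets/sample.pkl'
feature_files = [
    feature_uppercase(dataset),
    feature_punct(dataset),
    feature_emoji(dataset),
]
# One K-means label per requested cluster count, per author.
clusters = feature_to_cluster(feature_files, num_clusters=(5, 10, 20))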
def feature_to_graph(feature_file):
    graph_filename = "networks/" + os.path.basename(
        feature_file)[:-4] + "_graph.txt"
    graph_file = open(graph_filename, "w")
    feature = utils.load_feature(feature_file)
    feature_list = list(feature.items())
    feature_list.sort(key=lambda tup: tup[1])
    distance_adjustment = 0.01

    # Setting the log scale
    feature_logged = []
    # Find the smallest value above 0
    smallest = 0
    for val in feature_list:
        if val[1] > 0:
            smallest = val
            break
    if smallest == 0:  # Every author has a zero value,
        graph_file.close()
        return graph_filename  # so return an empty graph
    for i, val in enumerate(feature_list):
        if val[1] > 0:
            feature_logged.append((val[0], math.log10(val[1])))
        else:
            feature_logged.append((val[0], math.log10(smallest[1])))

    # Normalize the feature list to [0, 1]
    amin = min(feature_logged, key=lambda tup: tup[1])
    amax = max(feature_logged, key=lambda tup: tup[1])
    feature_normed = []
    for i, val in enumerate(feature_logged):
        feature_normed.append(
            (val[0], (val[1] - amin[1]) / (amax[1] - amin[1])))

    std_deviation = statistics.stdev(list(map(lambda x: x[1], feature_normed)))
    for i in tqdm(range(len(feature_normed))):
        edges = []
        for j in range(i + 1, len(feature_normed)):
            distance = feature_normed[j][1] - feature_normed[i][1]
            if distance < std_deviation * distance_adjustment:
                # Link two authors only when their distance is below a
                # small fraction of the standard deviation
                similarity = 1 - distance
                edges.append((feature_normed[j][0], similarity))
            else:
                # The list is sorted, so every later author is even
                # farther away; stop scanning
                break
        # Write the edges to file
        for edge in edges:
            print(feature_normed[i][0] + " " + edge[0] + " " + str(edge[1]),
                  file=graph_file)
    graph_file.close()
    return graph_filename
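# The output is a plain space-separated edge list ("source target weight"
# per line), so it loads directly with networkx. The path below is
# hypothetical:
import networkx as nx

g = nx.read_weighted_edgelist('networks/sample_feature_graph.txt')
print(g.number_of_nodes(), g.number_of_edges())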
def feature_profanity(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_profanity.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_profanity = {}
        for author in tqdm(author_data):
            single_text = ''.join(author_data[author])
            profanity_rate = predict_prob([single_text])
            author_profanity[author] = profanity_rate[0]
        with open(feature_filename, 'wb') as profanity_file:
            pickle.dump(author_profanity, profanity_file)
    return feature_filename
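# `predict_prob` looks like the scorer from the profanity-check package (an
# assumption; the import is not shown in this section). It takes a list of
# strings and returns one probability in [0, 1] per string:
from profanity_check import predict_prob

scores = predict_prob(['hello there', 'what the heck'])
print(scores)  # one per-text profanity probability per input string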
def overlapping_ngrams(ngram_feature_filename):
    author_ngrams = utils.load_feature(ngram_feature_filename)
    base = os.path.basename(ngram_feature_filename)[:-4]
    unigram_filename = "networks/" + base + "_unigram_overlap_graph.txt"
    bigram_filename = "networks/" + base + "_bigram_overlap_graph.txt"
    trigram_filename = "networks/" + base + "_trigram_overlap_graph.txt"
    unigram_file = open(unigram_filename, "w")
    bigram_file = open(bigram_filename, "w")
    trigram_file = open(trigram_filename, "w")
    authors = list(author_ngrams.keys())
    for i in tqdm(range(len(authors))):
        source_ngrams = author_ngrams[authors[i]]
        for j in range(i + 1, len(authors)):
            target_ngrams = author_ngrams[authors[j]]
            # Unigrams
            if len(source_ngrams.unigrams) > 0:
                unigram_ratio = len(
                    source_ngrams.unigrams.intersection(
                        target_ngrams.unigrams)) / len(source_ngrams.unigrams)
                if unigram_ratio >= 0.15:  # Threshold
                    print(authors[i] + " " + authors[j] + " " +
                          str(unigram_ratio), file=unigram_file)
            # Bigrams
            if len(source_ngrams.bigrams) > 0:
                bigram_ratio = len(
                    source_ngrams.bigrams.intersection(
                        target_ngrams.bigrams)) / len(source_ngrams.bigrams)
                if bigram_ratio >= 0.05:
                    print(authors[i] + " " + authors[j] + " " +
                          str(bigram_ratio), file=bigram_file)
            # Trigrams
            if len(source_ngrams.trigrams) > 0:
                trigram_ratio = len(
                    source_ngrams.trigrams.intersection(
                        target_ngrams.trigrams)) / len(source_ngrams.trigrams)
                if trigram_ratio > 0.01:
                    print(authors[i] + " " + authors[j] + " " +
                          str(trigram_ratio), file=trigram_file)
    unigram_file.close()
    bigram_file.close()
    trigram_file.close()
    return [unigram_filename, bigram_filename, trigram_filename]
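# Note that the ratio above is containment (overlap divided by the size of
# the source set only), not Jaccard similarity, so the measure is
# asymmetric between the two authors. A toy illustration:
source = {'a', 'b', 'c', 'd'}
target = {'c', 'd', 'e'}
containment = len(source & target) / len(source)       # 0.5
jaccard = len(source & target) / len(source | target)  # 0.4
print(containment, jaccard)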
def feature_sentence_length(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_sentence_length.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_sentence_lengths = {}
        for author in tqdm(author_data):
            single_text = ''.join(author_data[author])
            sentences = nltk.sent_tokenize(single_text)
            if len(sentences) > 0:  # Just a precaution
                total_sentence_length = sum(len(s) for s in sentences)
                author_sentence_lengths[author] = (
                    total_sentence_length / len(sentences))
        with open(feature_filename, 'wb') as lengths_file:
            pickle.dump(author_sentence_lengths, lengths_file)
    return feature_filename
def feature_grammar_check(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_grammar_check.pkl'
    if not os.path.isfile(feature_filename):
        tool = language_check.LanguageTool('en-US')
        author_data = utils.load_feature(dataset_filename_pkl)
        author_grammars = {}
        for author in tqdm(author_data):
            single_text = ''.join(author_data[author])
            sentences = nltk.sent_tokenize(single_text)
            if len(sentences) > 0:  # Just a precaution
                matches = tool.check(single_text)
                author_grammars[author] = len(matches) / len(sentences)
        with open(feature_filename, 'wb') as grammar_file:
            pickle.dump(author_grammars, grammar_file)
    return feature_filename
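# language_check wraps LanguageTool; tool.check returns one match object per
# detected issue, so len(matches) / len(sentences) is issues per sentence.
# A small standalone example:
import language_check

tool = language_check.LanguageTool('en-US')
matches = tool.check('She go to school every days.')
print(len(matches))       # number of flagged issues
print(matches[0].ruleId)  # identifier of the first triggered rule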
def calcurate_roc_auc(df, num_test, num_segment, feature_name_list,
                      feature_size_dict, model):
    y_pred = np.zeros((num_test, 50))
    y_true = np.zeros((num_test, 50))
    count = 0
    for index in df.index:
        split = df.split[index]
        clip_id = df.clip_id[index]
        if split == 'test':
            print('index', index)
            predict_label_sum = np.zeros(50)
            # Average the model's predictions over the clip's segments;
            # the segment index in the filename is 1-based. The original
            # hard-coded range(1, 11) while dividing by num_segment.
            for j in range(1, num_segment + 1):
                feature_dir = 'features_norm/feature_%s_%d_%d.pickle' % (
                    split, clip_id, j)
                features = load_feature(feature_dir)
                feature_list = []
                for feature_name in feature_name_list:
                    feature_size = feature_size_dict[feature_name]
                    feature = features[feature_name]
                    if len(feature_size) == 1:
                        feature = feature.reshape(1, feature_size[0])
                    if len(feature_size) == 2:
                        feature = feature.reshape(1, feature_size[0],
                                                  feature_size[1])
                    feature_list.append(feature)
                predict_label = model.predict(feature_list).reshape(50)
                predict_label_sum += predict_label
            predict_label_song = predict_label_sum / num_segment
            true_label_song = df.iloc[:, 1:51][index:index + 1].values.reshape(50)
            y_pred[count] = predict_label_song
            y_true[count] = true_label_song
            count += 1
    roc_auc = roc_auc_score(y_true, y_pred, average='macro')
    return roc_auc
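# For reference, average='macro' computes one ROC-AUC per tag column and
# takes their unweighted mean. A self-contained illustration with random
# data; the shapes merely mirror the 50-tag setup here:
import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=(200, 50))  # binary tag labels
y_score = rng.random(size=(200, 50))         # predicted probabilities
print(roc_auc_score(y_true, y_score, average='macro'))  # ~0.5 for random scores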
def feature_emoji(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_emoji.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_emoji = {}
        for author in tqdm(author_data):
            comments = author_data[author]
            emoji_count = 0
            character_count = 0
            for comment in comments:
                # Eyes (: ; = x), optional nose (-), mouth ( ) ( D P S ).
                # The original mouth group `(?:\)|\(D|P|S)` treated "(D" as
                # a single alternative, so ":(" and ":D" never matched.
                emoji_count += len(
                    re.findall(r'(?::|;|=|x)-?(?:\)|\(|D|P|S)', comment))
                character_count += len(comment)
            emoji_rate = emoji_count / character_count
            author_emoji[author] = emoji_rate
        with open(feature_filename, 'wb') as emoji_file:
            pickle.dump(author_emoji, emoji_file)
    return feature_filename
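# A quick check of what the corrected emoticon pattern matches:
import re

pattern = r'(?::|;|=|x)-?(?:\)|\(|D|P|S)'
print(re.findall(pattern, 'great :) sad :( lol xD meh :-P'))
# [':)', ':(', 'xD', ':-P']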
def __getitem__(self, index):
    vid, duration, timestamps, sentence, words, id2pos, adj_mat = \
        self.data[index]
    feats = load_feature(os.path.join(self.feature_path,
                                      'tall_c3d_features.hdf5'),
                         vid=vid, dataset='TACOS')
    fps = feats.shape[0] / duration
    adj_mat = np.asarray(adj_mat)
    start_frame = int(fps * timestamps[0])
    end_frame = int(fps * timestamps[1])
    if end_frame >= feats.shape[0]:
        end_frame = feats.shape[0] - 1
    if start_frame > end_frame:
        start_frame = end_frame
    assert start_frame <= end_frame
    assert 0 <= start_frame < feats.shape[0]
    assert 0 <= end_frame < feats.shape[0]
    label = np.asarray([start_frame, end_frame]).astype(np.int32)
    words_vec = np.asarray([
        self.word2vec[word] if word in self.word2vec.vocab
        else np.zeros(300).astype(np.float32)
        for word in words
    ])
    words_vec = words_vec.astype(np.float32)
    id2pos = np.asarray(id2pos).astype(np.int64)
    return feats, words_vec, label, id2pos, adj_mat.astype(np.int32)
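# `load_feature` here is defined elsewhere. A minimal sketch of a reader
# matching both call shapes in these __getitem__ methods, assuming the HDF5
# file stores one (num_frames, feat_dim) C3D matrix per video id; the
# on-disk layout is an assumption, not taken from the source:
import h5py
import numpy as np

def load_feature(path, vid=None, dataset=None):
    # HDF5 variant: one feature matrix per video id.
    if path.endswith('.hdf5'):
        with h5py.File(path, 'r') as f:
            return np.asarray(f[vid])
    # .npy variant used by the other __getitem__.
    return np.load(path)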
def __getitem__(self, index):
    vid, duration, timestamps, sentence = self.data[index]
    feats = load_feature(os.path.join(self.feature_path,
                                      '%s.npy' % vid[:-4]),
                         dataset='TACOS')
    fps = feats.shape[0] / duration
    start_frame = int(fps * timestamps[0])
    end_frame = int(fps * timestamps[1])
    if end_frame >= feats.shape[0]:
        end_frame = feats.shape[0] - 1
    if start_frame > end_frame:
        start_frame = end_frame
    assert start_frame <= end_frame
    assert 0 <= start_frame < feats.shape[0]
    assert 0 <= end_frame < feats.shape[0]
    label = np.asarray([start_frame, end_frame]).astype(np.int32)
    words = tokenize(sentence, self.word2vec)
    words_vec = np.asarray([self.word2vec[word] for word in words])
    words_vec = words_vec.astype(np.float32)
    return feats, words_vec, label
def feature_ngrams(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_ngrams.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        tokenizer = RegexpTokenizer(r'\w+')
        stop_words = set(stopwords.words('english'))
        author_ngrams = {}
        for author in tqdm(author_data):
            comments = author_data[author]
            unigrams_set = set()
            bigrams_set = set()
            trigrams_set = set()
            for comment in comments:
                sentences = nltk.sent_tokenize(comment)
                for sentence in sentences:
                    sentence = sentence.lower()
                    tokens = tokenizer.tokenize(sentence)
                    filtered_sentence = [
                        w for w in tokens if w not in stop_words
                    ]
                    bigrams = ngrams(filtered_sentence, 2)
                    trigrams = ngrams(filtered_sentence, 3)
                    # Tokens are already unigrams
                    unigrams_set.update(filtered_sentence)
                    bigrams_set.update(bigrams)
                    trigrams_set.update(trigrams)
            # Now we have the set of n-grams for each author
            author_ngram_sets = NgramSets()
            author_ngram_sets.unigrams = unigrams_set
            author_ngram_sets.bigrams = bigrams_set
            author_ngram_sets.trigrams = trigrams_set
            author_ngrams[author] = author_ngram_sets
        with open(feature_filename, 'wb') as ngrams_file:
            pickle.dump(author_ngrams, ngrams_file)
    return feature_filename
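# `NgramSets` is not defined in this section. A minimal sketch of the
# container implied by feature_ngrams and overlapping_ngrams (three
# set-valued attributes):
class NgramSets:
    def __init__(self):
        self.unigrams = set()
        self.bigrams = set()
        self.trigrams = set()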
def feature_uppercase(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_uppercase.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_uppercase = {}
        for author in tqdm(author_data):
            comments = author_data[author]
            uppercase_count = 0
            character_count = 0
            for comment in comments:
                for character in comment:
                    if character.isupper():
                        uppercase_count += 1
                character_count += len(comment)
            uppercase_rate = uppercase_count / character_count
            author_uppercase[author] = uppercase_rate
        with open(feature_filename, 'wb') as uppercase_file:
            pickle.dump(author_uppercase, uppercase_file)
    return feature_filename
def feature_punct(dataset_filename_pkl):
    feature_filename = 'features/' + os.path.basename(
        dataset_filename_pkl)[:-4] + '_feature_punct.pkl'
    if not os.path.isfile(feature_filename):
        author_data = utils.load_feature(dataset_filename_pkl)
        author_punct = {}
        for author in tqdm(author_data):
            comments = author_data[author]
            punct_count = 0
            character_count = 0
            for comment in comments:
                for character in comment:
                    if character in string.punctuation:
                        punct_count += 1
                character_count += len(comment)
            punct_rate = punct_count / character_count
            author_punct[author] = punct_rate
        with open(feature_filename, 'wb') as punct_file:
            pickle.dump(author_punct, punct_file)
    return feature_filename
"""
@File: 02_xgb_cv_poly.py
@Time: 2018/10/24 18:13
@Software: PyCharm
@Description:
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from utils import load_feature

df_train, df_test, label = load_feature()

scaler = StandardScaler()
X = df_train.drop(['日期'], axis=1, inplace=False)  # drop the date column
X = scaler.fit_transform(X)
y = label.values

# Data for the final predictions
sub_x = df_test.drop(['日期'], axis=1, inplace=False)
# Reuse the training-set statistics; re-fitting the scaler on the test
# features (as the original did) scales the two sets inconsistently.
sub_x = scaler.transform(sub_x)

kf = KFold(n_splits=5, random_state=123, shuffle=True)
clf = XGBRegressor(objective='reg:linear',
                   n_estimators=1000,
                   min_child_weight=1,
                   learning_rate=0.01,
                   max_depth=5,
                   )  # remaining hyperparameters truncated in the source
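# The script is cut off after the model definition. Given the KFold and
# mean_absolute_error imports, the cross-validation loop presumably
# continues along these lines; a sketch under that assumption, not the
# author's code:
oof_mae = []
sub_pred = np.zeros(len(sub_x))
for train_idx, valid_idx in kf.split(X):
    clf.fit(X[train_idx], y[train_idx])
    valid_pred = clf.predict(X[valid_idx])
    oof_mae.append(mean_absolute_error(y[valid_idx], valid_pred))
    # Average the test predictions across folds.
    sub_pred += clf.predict(sub_x) / kf.n_splits
print('CV MAE: %.4f' % np.mean(oof_mae))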
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    pcanet = PCANet(args.stages, args.filter_shape, args.stages_channels,
                    args.block_size, args.block_overlap)
    train_queue, valid_queue = load_train_mnist(args)  # load dataset
    logging.info("load training dataset completely")
    total_train_labels = torch.tensor([]).long()
    writer = SummaryWriter(args.save)  # tensorboardX

    # extract feature from images
    with torch.no_grad():
        # first of all, generate eigenvector, and then execute convolution
        stage_save_path = args.save
        save_filename = utils.create_pickle_file_name(stage_save_path, 0)
        for global_step, (train_images, train_labels) in enumerate(train_queue):
            train_images = train_images.cuda()
            total_train_labels = torch.cat((total_train_labels, train_labels))
            utils.save_feature([train_images, train_labels], save_filename)
            pcanet.unrolled_stage(train_images, 0)
            if global_step % args.log_freq == 0:
                logging.info("init training global_step: %d" % global_step)
                # convert a batch of tensors into CHW format
                grid_images = make_grid(train_images, nrow=16, padding=5,
                                        pad_value=125)
                writer.add_image("raw_images_in_step_%d" % global_step,
                                 grid_images)

        total_features = torch.tensor([])  # empty tensor
        for stage in range(args.stages):
            logging.info('PCANet stage: %d' % stage)
            # transform eigenvector to convolution kernel
            kernel = pcanet.eigenvector_to_kernel(stage)
            load_filename = utils.create_pickle_file_name(
                stage_save_path, stage)
            if stage + 1 < args.stages:
                save_filename = utils.create_pickle_file_name(
                    stage_save_path, stage + 1)
            load_filename_pointer = 0  # clear file object pointer
            for step in range(global_step + 1):
                train_images, train_labels, load_filename_pointer = \
                    utils.load_feature(load_filename, load_filename_pointer)
                batch_features = pcanet.pca_conv(train_images, kernel)
                if step % args.log_freq == 0:
                    # view the i-th image's feature map in a single batch
                    single_image_feature = utils.exchange_channel(
                        batch_features[5])
                    grid_images = make_grid(single_image_feature, nrow=8,
                                            padding=5, pad_value=125)
                    writer.add_image(
                        "feature_image_in_step_%d_in_stage_%d" % (step, stage),
                        grid_images)
                if stage + 1 < args.stages:
                    utils.save_feature([batch_features, train_labels],
                                       save_filename)
                    pcanet.unrolled_stage(batch_features, stage + 1)
                else:
                    decimal_features = pcanet.binary_mapping(batch_features,
                                                             stage)
                    final_features = pcanet.generate_histogram(
                        decimal_features)
                    final_features = final_features.cpu()
                    total_features = torch.cat(
                        (total_features, final_features), dim=0)
                if step % args.log_freq == 0:
                    logging.info("circulate training step: %d" % step)
            grid_kernels = make_grid(pcanet.kernel[stage],
                                     nrow=args.stages_channels[stage],
                                     padding=5, pad_value=125)
            writer.add_image("kernel_in_stage_%d" % stage, grid_kernels)
    writer.close()

    logging.info('extract feature completely, start training classifier')
    # train classifier
    classifier = LinearSVC()
    # classifier = SVC()
    # total_features = total_features.cpu()
    classifier.fit(total_features, total_train_labels)
    logging.info('classifier trained completely')

    # save model
    utils.save_model(pcanet, stage_save_path + "/pcanet.pkl")
    utils.save_model(classifier, stage_save_path + "/classifier.pkl")
    train_score = classifier.score(total_features, total_train_labels)
    logging.info("score of training is %s" % train_score)
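# Since utils.save_model writes both models to .pkl files, a later script
# can presumably reload them for inference. A minimal sketch, assuming
# save_model is a plain pickle dump and that the save directory and
# test_features below are hypothetical; test features must go through the
# same PCANet extraction (pca_conv -> binary_mapping -> generate_histogram)
# as in training:
import pickle

with open('exp/pcanet.pkl', 'rb') as f:
    pcanet = pickle.load(f)
with open('exp/classifier.pkl', 'rb') as f:
    classifier = pickle.load(f)

pred = classifier.predict(test_features)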