def main():
    print("Reading csv files")
    dataset = read_all_DS()
    trainset = pd.read_csv('dataRev2/Train.csv')
    train_confirmed = trainset[['AuthorId', 'ConfirmedPaperIds']].rename(
        columns={'ConfirmedPaperIds': 'PaperIds'})
    train_deleted = trainset[['AuthorId', 'DeletedPaperIds']].rename(
        columns={'DeletedPaperIds': 'PaperIds'})
    validset = pd.read_csv('dataRev2/Valid.csv')
    testset = pd.read_csv('dataRev2/Test.csv')
    allsets = pd.concat([train_confirmed, validset, testset])
    all_dups = make_duplicates_from_targets(allsets)
    dataset['all_duplicates'] = all_dups

    print("Getting features for confirmed papers")
    features_conf = get_features(dataset, train_confirmed)
    print("Getting features for deleted papers")
    features_deleted = get_features(dataset, train_deleted)
    print("Getting features for valid papers")
    features_valid = get_features(dataset, validset)

    pickle.dump(features_deleted, open(data_io.get_paths()["deleted_features"], 'wb'))
    pickle.dump(features_conf, open(data_io.get_paths()["confirmed_features"], 'wb'))
    pickle.dump(features_valid, open(data_io.get_paths()["valid_features"], 'wb'))

    print("Getting features for test papers")
    features_test = get_features(dataset, testset)
    pickle.dump(features_test, open(data_io.get_paths()["test_features"], 'wb'))
def process_train_samples(samples, max_srch_size=10, each_saved_size=1000000):
    sorted_samples = samples.sort_values(by=["srch_id"])
    sorted_samples = sorted_samples.reset_index(drop=True)
    processed_samples = pd.DataFrame()
    samples_in_one_srch = pd.DataFrame()
    for r_idx, sample in sorted_samples.iterrows():
        if (r_idx + 1) % 1000 == 0:
            print "Processed %i sample of %i" % (r_idx + 1, sorted_samples.shape[0])
        is_next_in_same_search = True
        samples_in_one_srch = pd.concat(
            (sample.to_frame().transpose(), samples_in_one_srch), axis=0)
        current_srch_id = sample["srch_id"]
        if (r_idx + 1) == sorted_samples.shape[0]:
            is_next_in_same_search = False
        else:
            next_srch_id = sorted_samples["srch_id"][r_idx + 1]
            if current_srch_id != next_srch_id:
                is_next_in_same_search = False
        if not is_next_in_same_search:
            ext_samples_in_one_srch = extract_features(samples_in_one_srch)
            n_samples = ext_samples_in_one_srch.shape[0]
            if n_samples > max_srch_size:
                if np.any(ext_samples_in_one_srch["booking_bool"]):
                    pos_samples = ext_samples_in_one_srch[
                        ext_samples_in_one_srch["booking_bool"] == 1]
                    neg_samples = ext_samples_in_one_srch[
                        ext_samples_in_one_srch["booking_bool"] == 0]
                    selected_neg_samples = neg_samples.sample(
                        n=max_srch_size - pos_samples.shape[0])
                    selected_samples = pd.concat(
                        (pos_samples, selected_neg_samples), axis=0)
                else:
                    selected_samples = ext_samples_in_one_srch.sample(
                        n=max_srch_size)
            else:
                selected_samples = ext_samples_in_one_srch.copy()
            processed_samples = pd.concat(
                (processed_samples, selected_samples), axis=0)
            samples_in_one_srch = pd.DataFrame()
        if (r_idx + 1) % each_saved_size == 0:
            save_file_name = "proc_train_samples_%i.csv" % (r_idx + 1)
            save_path = get_paths()["proc_train_path"]
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            if np.any(np.isnan(processed_samples.values)):
                processed_samples = processed_samples.fillna(value=0)
            processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)
    save_file_name = "proc_train_samples_%i.csv" % (r_idx + 1)
    save_path = get_paths()["proc_train_path"]
    if np.any(np.isnan(processed_samples.values)):
        processed_samples = processed_samples.fillna(value=0)
    processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)
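A minimal usage sketch for process_train_samples follows. The data_io.read_train() loader is an assumption, modelled on the data_io.read_test() call used in the prediction code; the NaN handling here simply mirrors the test-side preprocessing and is not taken from the original pipeline.

# Hypothetical driver for the function above (read_train() is assumed, not
# confirmed to exist in this repo's data_io module).
if __name__ == "__main__":
    raw_train = data_io.read_train()        # assumed loader for the raw training CSV
    raw_train = raw_train.fillna(value=0)   # mirror the test-side NaN handling
    process_train_samples(raw_train, max_srch_size=10, each_saved_size=1000000)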
def parse_papers(self):
    # Create Papers
    print "Parsing Papers..."
    f = open(data_io.get_paths()["paper_processed_path"], "r")
    titles = f.readline()
    for l in f.readlines():
        res = l.strip().split(",")
        paper_title = unidecode.unidecode(unicode(res[1], encoding="utf-8"))
        title_words = nlp.filter_paper_title(paper_title)
        paper_keyword = unidecode.unidecode(unicode(res[5], encoding="utf-8"))
        filtered_keyword = nlp.filter_paper_keyword(paper_keyword)
        self.papers[int(res[0])] = paper.Paper(int(res[0]), title_words,
                                               int(res[2]), int(res[3]),
                                               int(res[4]), filtered_keyword)
        for tt in title_words.split():
            try:
                self.paper_titles[tt] = self.paper_titles[tt] + 1
            except:
                self.paper_titles[tt] = 1
    print "Done"
    f.close()
def classify_catagory(train, test):
    print("Train-test split")
    trainX, testX, trainY, testY = train_test_split(train, test, random_state=1)
    print "TrainX size = ", str(trainX.shape)
    print "TestX size = ", str(testX.shape)
    classifier = GradientBoostingClassifier(n_estimators=1024,
                                            random_state=1,
                                            subsample=.8,
                                            min_samples_split=10,
                                            max_depth=6,
                                            verbose=3)
    classifier.fit(trainX, trainY)
    print "Score = ", classifier.score(testX, testY)
    feature_importance = classifier.feature_importances_
    logger = open(data_io.get_paths()["feature_importance_path"], "a")
    for fi in feature_importance:
        logger.write(str(fi))
        logger.write("\n")
    logger.write("###########################################\n")
    logger.close()
    return classifier
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(
            data_io.get_paths()["real_feature_path"], 'add.noise.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Additive noise model AB', ['A', 'B'], f.add_noise_model_AB),
                ('Additive noise model BA', ['A', 'B'], f.add_noise_model_BA)]
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()
    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()
    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('add_noise', all_features, feature_names)
def parse_authors(self):
    # Create authors
    print "Parsing Authors..."
    f = open(data_io.get_paths()["author_processed_path"], "r")
    titles = f.readline()
    for l in f.readlines():
        res = l.strip().split(",")
        # Titles
        raw_title = unidecode.unidecode(unicode(res[1], encoding="utf-8"))
        (name, surname) = nlp.filter_title(raw_title)
        try:
            self.surnames[surname] = self.surnames[surname] + 1
        except:
            self.surnames[surname] = 1
        # Affiliations
        raw_affiliation = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
        affiliation = nlp.filter_affiliation(raw_affiliation)
        try:
            self.affiliations[affiliation] = self.affiliations[affiliation] + 1
        except:
            self.affiliations[affiliation] = 1
        self.authors[int(res[0])] = author.Author(int(res[0]), name, surname, affiliation)
    print "Done"
    f.close()
def parse_paperauthors(self):
    # Update all journal/conference/coauthor information
    print "Parsing PaperAuthors..."
    f = open(data_io.get_paths()["paperauthor_processed_path"], "r")
    titles = f.readline()
    count = 0
    for l in f:
        count += 1
        if count % 100000 == 0:
            print count
        res = l.strip().split(",")
        if not res[0].isdigit():
            continue
        paper_id = int(res[0])
        author_id = int(res[1])
        raw_author_name = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
        author_name = nlp.filter_title(raw_author_name)[0]
        raw_author_affiliation = unidecode.unidecode(unicode(res[3], encoding="utf-8"))
        author_affiliation = nlp.filter_affiliation(raw_author_affiliation)
        curr_paper = self.papers.get(paper_id)
        curr_author = self.authors.get(author_id)
        self.update_paperauthor(curr_paper, curr_author, author_id,
                                author_name, author_affiliation)
    print "Done"
    f.close()
def train(f, file_path):
    file_pt = open(file_path, "r")
    title = file_pt.readline()
    ret = None
    for l in file_pt.readlines():
        res = l.split(",")
        fet = f.create_features_from_res(res)
        if ret is None:
            ret = fet
        elif fet is not None:
            ret = numpy.vstack((ret, fet))
    print ret.shape
    # classifier = RandomForestClassifier(n_estimators=100,
    #                                     verbose=2,
    #                                     n_jobs=1,
    #                                     min_samples_split=10,
    #                                     random_state=1)
    classifier = GradientBoostingClassifier(n_estimators=512,
                                            verbose=3,
                                            max_depth=6,
                                            min_samples_split=10,
                                            subsample=0.8,
                                            random_state=1)
    valid_ret = validate(f, data_io.get_paths()["valid_sol_path"], classifier)
    ret = numpy.vstack((ret, valid_ret))
    print "Final size: ", ret.shape
    trainX, testX, trainY, testY = train_test_split(ret[:, 3:], ret[:, 0],
                                                    random_state=1)
    classifier.fit(trainX, trainY)
    numpy.savetxt(data_io.get_paths()["feature_path"], ret.astype(float),
                  fmt='%f', delimiter=",")
    print classifier.score(testX, testY)
    #validate(f, data_io.get_paths()["valid_sol_path"], classifier)
    print classifier.score(valid_ret[:, 3:], valid_ret[:, 0])
    return classifier
def main():
    submission_path = data_io.get_paths()["submission_path"]
    reader = csv.reader(open(submission_path))
    reader.next()  # skipping the header
    recommendations = [(int(row[0]), int(row[1]), -i)
                       for i, row in enumerate(reader)]
    out_path = submission_path[:-4] + "Reversed.csv"
    data_io.write_submission(recommendations, submission_path=out_path)
def paper_keywords(data):
    paper = data['paper']
    paperid = list(paper["Id"])
    paper_keyword = defaultdict(list)
    paper = paper.set_index("Id")
    paper['Keyword'] = paper['Keyword'].fillna("")
    paper['Title'] = paper['Title'].fillna("")
    title = list(paper["Title"])
    cnt = 0
    start_time = time.time()
    titleTokens = []
    print("Start title!!!")
    for t in title:
        cnt += 1
        if (cnt % 100000 == 0):
            print("Count: ", cnt)
            print("Time: ", time.time() - start_time)
        titleTokens.append(tokenize(t))
    paper['Token'] = titleTokens
    #paper['Token'] = paper.Title.map(tokenize)

    print("Start keyword!!!")
    keywords = list(paper['Keyword'])
    cnt2 = 0
    keywordTokens = []
    for k in keywords:
        cnt2 += 1
        if (cnt2 % 100000 == 0):
            print("Count: ", cnt2)
            print("Time: ", time.time() - start_time)
        keywordTokens.append(filter_keyword(k))
    paper['Keyword_pro'] = keywordTokens

    print("Start concatenation!!!")
    # TODO: change all "apply"/"map" calls to explicit for loops. Don't build
    # these lists with comprehensions, as that causes a memory limit error.
    # Concatenate keyword and title tokens
    keyToken = []
    for i in paperid:
        keyToken.append(list(set(paper.loc[i, 'Keyword_pro'] + paper.loc[i, 'Token'])))
    paper['Key_token'] = keyToken
    for i in paperid:
        paper_keyword[i] = paper.loc[i, 'Key_token']
    pickle.dump(paper_keyword, open(data_io.get_paths()["paper_title_tokens"], 'wb'))
    print("Process done, time: ", time.time() - start_time)
    return paper_keyword
def predict(f, classifier, file_path):
    file_pt = open(file_path, "r")
    title = file_pt.readline()
    output = open(data_io.get_paths()["submission_path"], "a")
    tot_fet = None
    for l in file_pt.readlines():
        res = l.split(",")
        fet = f.create_features_from_res(res)
        if tot_fet is None:
            tot_fet = fet
        else:
            tot_fet = numpy.vstack((tot_fet, fet))
        pred = classifier.predict_proba(fet[:, 3:])
        sorted_pred = sorted(zip(res[1].split(), pred[:, 1]),
                             key=lambda a: a[1], reverse=True)
        #print sorted_pred
        output.write(res[0] + "," + " ".join(map(lambda a: a[0], sorted_pred)) + "\n")
    output.close()
    numpy.savetxt(data_io.get_paths()["test_feature_path"], tot_fet.astype(float),
                  fmt='%f', delimiter=",")
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(
            data_io.get_paths()["real_feature_path"], 'reasonable_features.csv')):
        print 'Feature file already exists - not overwriting'
        return

    # Apologies for the weird 'derived' feature definition mechanism - it is a
    # quick hack to prevent duplicated computation
    features = [('A: Normalized Entropy', 'A', f.normalized_entropy),
                ('B: Normalized Entropy', 'B', f.normalized_entropy),
                ('Pearson R', ['A', 'B'], f.correlation),
                ('Pearson R Magnitude', 'derived', 'abs(output[key][2])'),
                ('Entropy Difference', 'derived', 'output[key][0] - output[key][1]'),
                ('Entropy Ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Spearman rank correlation', ['A', 'B'], f.rcorrelation),
                ('Spearman rank magnitude', 'derived', 'abs(output[key][6])'),
                ('Kurtosis A', 'A', f.fkurtosis),
                ('Kurtosis B', 'B', f.fkurtosis),
                ('Kurtosis difference', 'derived', 'output[key][8] - output[key][9]'),
                ('Kurtosis ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Unique ratio A', 'A', f.unique_ratio),
                ('Unique ratio B', 'B', f.unique_ratio),
                ('Skew A', 'A', f.fskew),
                ('Skew B', 'B', f.fskew),
                ('Skew difference', 'derived', 'output[key][14] - output[key][15]'),
                ('Skew ratio', 'derived', 'output[key][14] / output[key][15] if not output[key][15] == 0 else output[key][14] / 0.000001'),
                ('Pearson - Spearman', 'derived', 'output[key][2] - output[key][6]'),
                ('Abs Pearson - Spearman', 'derived', 'output[key][3] - output[key][7]'),
                ('Pearson / Spearman', 'derived', 'output[key][2] / output[key][6] if not output[key][6] == 0 else output[key][2] / 0.000001')]
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()
    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()
    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('reasonable_features', all_features, feature_names)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(
            data_io.get_paths()["real_feature_path"], 'high_order_moments.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Moment 5 A', 'A', f.standard_moment_5),
                ('Moment 5 B', 'B', f.standard_moment_5),
                ('Moment 5 diff', 'derived', 'output[key][0] - output[key][1]'),
                ('Moment 5 ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Moment 6 A', 'A', f.standard_moment_6),
                ('Moment 6 B', 'B', f.standard_moment_6),
                ('Moment 6 diff', 'derived', 'output[key][4] - output[key][5]'),
                ('Moment 6 ratio', 'derived', 'output[key][4] / output[key][5] if not output[key][5] == 0 else output[key][4] / 0.000001'),
                ('Moment 7 A', 'A', f.standard_moment_7),
                ('Moment 7 B', 'B', f.standard_moment_7),
                ('Moment 7 diff', 'derived', 'output[key][8] - output[key][9]'),
                ('Moment 7 ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Moment 8 A', 'A', f.standard_moment_8),
                ('Moment 8 B', 'B', f.standard_moment_8),
                ('Moment 8 diff', 'derived', 'output[key][12] - output[key][13]'),
                ('Moment 8 ratio', 'derived', 'output[key][12] / output[key][13] if not output[key][13] == 0 else output[key][12] / 0.000001'),
                ('Moment 9 A', 'A', f.standard_moment_9),
                ('Moment 9 B', 'B', f.standard_moment_9),
                ('Moment 9 diff', 'derived', 'output[key][16] - output[key][17]'),
                ('Moment 9 ratio', 'derived', 'output[key][16] / output[key][17] if not output[key][17] == 0 else output[key][16] / 0.000001')]
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()
    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()
    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('high_order_moments', all_features, feature_names)
def parse_conferences(self):
    print "Parsing Conferences..."
    f = open(data_io.get_paths()["conference_processed_path"], "r")
    titles = f.readline()
    for l in f.readlines():
        res = l.strip().split(",")
        conference_id = int(res[0])
        raw_conference_title = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
        conference_title = nlp.filter_paper_title(raw_conference_title)
        self.conferences[conference_id] = conference_title
        for c in conference_title.split():
            if c in self.conference_freq.keys():
                self.conference_freq[c] = self.conference_freq[c] + 1
            else:
                self.conference_freq[c] = 1
def parse_journals(self):
    print "Parsing Journals..."
    f = open(data_io.get_paths()["journal_processed_path"], "r")
    titles = f.readline()
    for l in f.readlines():
        res = l.strip().split(",")
        journal_id = int(res[0])
        raw_journal_title = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
        journal_title = nlp.filter_paper_title(raw_journal_title)
        self.journals[journal_id] = journal_title
        for j in journal_title.split():
            if j in self.journal_freq.keys():
                self.journal_freq[j] = self.journal_freq[j] + 1
            else:
                self.journal_freq[j] = 1
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(
            data_io.get_paths()["real_feature_path"], 'icgi.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('ICGI entropy AB', ['A', 'B'], f.icgi_entropy_AB),
                ('ICGI entropy BA', ['A', 'B'], f.icgi_entropy_BA),
                ('ICGI entropy diff', 'derived', 'output[key][0] - output[key][1]'),
                ('ICGI slope AB', ['A', 'B'], f.icgi_slope_AB),
                ('ICGI slope BA', ['A', 'B'], f.icgi_slope_BA),
                ('ICGI slope diff', 'derived', 'output[key][3] - output[key][4]')]
                #('ICGI entropy AB PIT', ['A', 'B'], f.icgi_entropy_AB_PIT),
                #('ICGI entropy BA PIT', ['A', 'B'], f.icgi_entropy_BA_PIT),
                #('ICGI entropy diff PIT', 'derived', 'output[key][6] - output[key][7]'),
                #('ICGI slope AB PIT', ['A', 'B'], f.icgi_slope_AB_PIT),
                #('ICGI slope BA PIT', ['A', 'B'], f.icgi_slope_BA_PIT),
                #('ICGI slope diff PIT', 'derived', 'output[key][9] - output[key][10]')]
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()
    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()
    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('icgi', all_features, feature_names)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(
            data_io.get_paths()["real_feature_path"], 'unreasonable_features.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Number of Samples', 'A', len),
                ('Max A', 'A', max),
                ('Max B', 'B', max),
                ('Min A', 'A', min),
                ('Min B', 'B', min),
                ('Mean A', 'A', f.mean),
                ('Mean B', 'B', f.mean),
                ('Median A', 'A', f.median),
                ('Median B', 'B', f.median),
                ('Sd A', 'A', f.sd),
                ('Sd B', 'B', f.sd)]
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()
    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()
    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('unreasonable_features', all_features, feature_names)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(
            data_io.get_paths()["real_feature_path"], 'corrs.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Kendall tau', ['A', 'B'], f.kendall),
                ('Kendall tau p', ['A', 'B'], f.kendall_p),
                ('Mann Whitney', ['A', 'B'], f.mannwhitney),
                ('Mann Whitney p', ['A', 'B'], f.mannwhitney_p),
                #('Wilcoxon', ['A', 'B'], f.wilcoxon),
                #('Wilcoxon p', ['A', 'B'], f.wilcoxon_p),
                ('Kruskal', ['A', 'B'], f.kruskal),
                ('Kruskal p', ['A', 'B'], f.kruskal_p),
                ]
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()
    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()
    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('corrs', all_features, feature_names)
def prediction(n_train_samples):
    proc_test_samples_file = get_paths()["proc_test_samples_path"]
    if os.path.exists(proc_test_samples_file):
        print "Loading processed test data..."
        new_test_samples = pd.read_csv(proc_test_samples_file)
    else:
        print "Reading test data..."
        test_samples = data_io.read_test()
        test_samples = test_samples.fillna(value=0)
        print "Processing test samples"
        new_test_samples = process_test_samples(test_samples)
        new_test_samples.to_csv(proc_test_samples_file, index=None)
    test_feature = new_test_samples.values

    print "Loading the Random Forest Classifier"
    rf_classifier = data_io.load_model(model_name="rf_classifier.pkl")
    print "Random Forest Predicting"
    rf_predictions = rf_classifier.predict_proba(test_feature)[:, 1]

    print "Loading the Gradient Boosting Classifier"
    gb_classifier = data_io.load_model(model_name="gb_classifier.pkl")
    print "Gradient Boosting Predicting"
    gb_predictions = gb_classifier.predict_proba(test_feature)[:, 1]

    print "Loading the SGD Classifier"
    sgd_classifier = data_io.load_model(model_name="sgd_classifier.pkl")
    print "SGD Predicting"
    sgd_predictions = sgd_classifier.predict_proba(test_feature)[:, 1]

    prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))
    mean_score = np.mean(prob_arr, axis=0)
    mean_score = -1.0 * mean_score
    mean_recommendations = zip(new_test_samples["srch_id"],
                               new_test_samples["prod_id"],
                               mean_score)
    print "Writing predictions to file"
    data_io.write_submission(mean_recommendations,
                             submission_file="mean_result_%i.csv" % n_train_samples)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(
            data_io.get_paths()["real_feature_path"], 'injectivity.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Injectivity 10', ['A', 'B'], f.injectivity_10),
                ('Injectivity 15', ['A', 'B'], f.injectivity_15),
                ('Injectivity 20', ['A', 'B'], f.injectivity_20),
                ('Injectivity 25', ['A', 'B'], f.injectivity_25),
                ('Injectivity 30', ['A', 'B'], f.injectivity_30),
                ('Injectivity 35', ['A', 'B'], f.injectivity_35),
                ('Injectivity 40', ['A', 'B'], f.injectivity_40)]
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()
    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()
    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('injectivity', all_features, feature_names)
def main():
    data = pickle.load(open(data_io.get_paths()["valid_features"], 'rb'))
    predict_write(data, "valid")
import random
import sys
import os
from os.path import join as path_join

from data_io import get_paths

try:
    P_train = float(sys.argv[5])
    P_validation = float(sys.argv[6])
except IndexError:
    P_train = 0.6
    P_validation = 0.2

print "P train = %s %%" % (P_train * 100)
print "P validation = %s %%" % (P_validation * 100)
print "P test = %s %%" % ((1 - P_validation - P_train) * 100)

paths = get_paths("Settings_submission.json")
input_file = sys.argv[1]
output_file1 = path_join(paths["data_path"], "data/processed", sys.argv[2])
output_file2 = path_join(paths["data_path"], "data/processed", sys.argv[3])
output_file3 = path_join(paths["data_path"], "data/processed", sys.argv[4])

print "Input: %s " % input_file
print "Train file: %s " % output_file1
print "Validation file: %s " % output_file2
print "Test file: %s " % output_file3

run = raw_input("OK (Y/N)?")
print run
if run != "Y":
    sys.exit()
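The split itself is not shown in this excerpt. Below is a minimal sketch of one way it could be done, assuming each data line after the header is assigned independently at random to the train/validation/test files using the probabilities above; this is an illustration, not the original script's logic.

# Hypothetical continuation: random line-by-line split with probabilities
# P_train / P_validation / remainder.
with open(input_file) as fin, \
        open(output_file1, "w") as f_train, \
        open(output_file2, "w") as f_valid, \
        open(output_file3, "w") as f_test:
    header = fin.readline()
    for out in (f_train, f_valid, f_test):
        out.write(header)
    for line in fin:
        r = random.random()
        if r < P_train:
            f_train.write(line)
        elif r < P_train + P_validation:
            f_valid.write(line)
        else:
            f_test.write(line)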
def parse_conference(file):
    parse(file, data_io.get_paths()["conference_processed_path"])
print "Loading the Random Forest Classifier" rf_classifier = data_io.load_model(model_name="rf_classifier.pkl") print "Random Forest Predicting" rf_predictions = rf_classifier.predict_proba(test_feature)[:, 1] print "Loading the Gradient Boosting Classifier" gb_classifier = data_io.load_model(model_name="gb_classifier.pkl") print "Gradient Boosting Predicting" gb_predictions = gb_classifier.predict_proba(test_feature)[:, 1] print "Loading the SGD Classifier" sgd_classifier = data_io.load_model(model_name="sgd_classifier.pkl") print "SGD Predicting" sgd_predictions = sgd_classifier.predict_proba(test_feature)[:, 1] prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions)) mean_score = np.mean(prob_arr, axis=0) mean_score = -1.0 * mean_score mean_recommendations = zip(new_test_samples["srch_id"], new_test_samples["prod_id"], mean_score) print "Writing predictions to file" data_io.write_submission(mean_recommendations, submission_file="mean_result_%i.csv" % n_train_samples) if __name__ == "__main__": n_train_samples = 8930723 save_train_sample_file = "proc_train_sample_%i.csv" % n_train_samples processed_train_csv_file = os.path.join(get_paths()["proc_train_path"], save_train_sample_file) training(processed_train_csv_file) prediction(n_train_samples)
def parse_journal(file):
    parse(file, data_io.get_paths()["journal_processed_path"])
def do_prediction(n_train_samples):
    proc_test_samples_file = get_paths()['proc_test_samples_path']
    if os.path.exists(proc_test_samples_file):
        print "Loading processed test data..."
        new_test_samples = pd.read_csv(proc_test_samples_file)
        print "Loading processed test data done"
    else:
        # prediction
        print "reading test data..."
        test_samples = data_io.read_test()
        test_samples = test_samples.fillna(value=0)
        print "done."
        # process test samples
        print "processing test data..."
        new_test_samples = process_test_samples(test_samples)
        new_test_samples.to_csv(proc_test_samples_file, index=None)
        print "Processing test data done."
    test_features = new_test_samples.values

    # 5.1 random forest prediction
    print("Loading the random forest classifier")
    rf_classifier = data_io.load_model(model_name='rf_classifier.pkl')
    print("random forest Predicting")
    # take the positive-class probabilities
    rf_predictions = rf_classifier.predict_proba(test_features)[:, 1]

    # 5.2 Gradient Boosting prediction
    print("Loading the Gradient Boosting classifier")
    gb_classifier = data_io.load_model(model_name='gb_classifier.pkl')
    print("Gradient Boosting Predicting")
    gb_predictions = gb_classifier.predict_proba(test_features)[:, 1]

    # 5.3 SGD prediction
    print("Loading the SGD classifier")
    sgd_classifier = data_io.load_model(model_name='sgd_classifier.pkl')
    print("SGD Predicting")
    sgd_predictions = sgd_classifier.predict_proba(test_features)[:, 1]

    # 5.4 LR prediction
    # print("Loading the LR classifier")
    # lr_classifier = data_io.load_model(model_name='lr_classifier.pkl')
    # print("Logistic Regression Predicting")
    # lr_predictions = lr_classifier.predict_proba(test_features)[:, 1]

    # step 6 score fusion: stack the three probability vectors into one array
    prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))
    # arithmetic mean of the probabilities
    mean_score = np.mean(prob_arr, axis=0)
    # the downstream sort is ascending, so negate to get a descending ranking
    mean_score = -1.0 * mean_score
    # geometric mean (works somewhat worse in practice)
    gmean = stats.gmean(prob_arr, axis=0)
    # negate for sorting
    gmean = -1.0 * gmean

    # step 7 output results
    mean_recommendations = zip(new_test_samples['srch_id'],
                               new_test_samples['prop_id'],
                               mean_score)
    gmean_recommendations = zip(new_test_samples['srch_id'],
                                new_test_samples['prop_id'],
                                gmean)
    print("Writing predictions to file")
    data_io.write_submission(mean_recommendations,
                             submission_file='mean_result_%i.csv' % n_train_samples)
    data_io.write_submission(gmean_recommendations,
                             submission_file='gmean_result_%i.csv' % n_train_samples)
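A small, self-contained illustration of the score-fusion step above. The numbers are made up, not taken from the competition data; it only shows how averaging per-model probabilities and negating them makes an ascending sort produce a descending ranking.

import numpy as np
from scipy import stats

# three models' positive-class probabilities for four hypothetical test rows
probs = np.array([[0.9, 0.2, 0.6, 0.4],
                  [0.8, 0.1, 0.7, 0.5],
                  [0.7, 0.3, 0.5, 0.6]])
mean_score = -1.0 * np.mean(probs, axis=0)       # arithmetic-mean fusion
gmean_score = -1.0 * stats.gmean(probs, axis=0)  # geometric-mean fusion
# ascending sort of the negated scores ranks the most likely booking first
print(np.argsort(mean_score))  # -> [0 2 3 1]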
from os.path import join as path_join
import re
#import string
import logging
from nltk.tokenize.regexp import WordPunctTokenizer
from nltk.corpus import stopwords
from itertools import izip, repeat
import operator
import joblib
from collections import Counter
from gensim.corpora.textcorpus import TextCorpus
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.mmcorpus import MmCorpus

from data_io import get_paths

paths = get_paths("Settings_loc5.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")
tmp_dir = path_join(data_dir, "tmp")

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logger = logging.getLogger('test_miislita')

regex = re.compile("^(.+;\s*)\n", re.MULTILINE)
regex_num = re.compile("\W(\d+)\W")
regex_hex = re.compile("\W([a-f0-9-]+)\W")
regex_punct = re.compile("^([:/\]\[\)\(,\.\}\{\?\!#=@'0-9]+)$")

english_stopwords = stopwords.words('english')
english_stopwords = dict(izip(english_stopwords,
                              repeat(True, len(english_stopwords))))
#tokenizer = WordPunctTokenizer()
    #validate(f, data_io.get_paths()["valid_sol_path"], classifier)
    print classifier.score(valid_ret[:, 3:], valid_ret[:, 0])
    return classifier


def predict(f, classifier, file_path):
    file_pt = open(file_path, "r")
    title = file_pt.readline()
    output = open(data_io.get_paths()["submission_path"], "a")
    tot_fet = None
    for l in file_pt.readlines():
        res = l.split(",")
        fet = f.create_features_from_res(res)
        if tot_fet is None:
            tot_fet = fet
        else:
            tot_fet = numpy.vstack((tot_fet, fet))
        pred = classifier.predict_proba(fet[:, 3:])
        sorted_pred = sorted(zip(res[1].split(), pred[:, 1]),
                             key=lambda a: a[1], reverse=True)
        #print sorted_pred
        output.write(res[0] + "," + " ".join(map(lambda a: a[0], sorted_pred)) + "\n")
    output.close()
    numpy.savetxt(data_io.get_paths()["test_feature_path"], tot_fet.astype(float),
                  fmt='%f', delimiter=",")


if __name__ == "__main__":
    p = parser.Parser()
    p.parse_csv()
    f = feature.Feature(p)
    classifier = train(f, data_io.get_paths()["train_path"])
    predict(f, classifier, data_io.get_paths()["valid_path"])
def main(): ''' print("Getting features for deleted papers from the database") features_deleted = data_io.get_features_db("TrainDeleted") print("Getting features for confirmed papers from the database") features_conf = data_io.get_features_db("TrainConfirmed") ''' features_deleted = pickle.load( open(data_io.get_paths()["deleted_features"], 'rb')) features_conf = pickle.load( open(data_io.get_paths()["confirmed_features"], 'rb')) features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted)) ] + [1 for x in range(len(features_conf))] print("Training the Classifier") features = np.array(features) target = np.array(target) ''' classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1) classifier.fit(features, target) ''' #Referred https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/ for parameter tuning param_test1 = {'max_depth': [19], 'min_child_weight': [1]} param_test2 = {'gamma': [i / 10.0 for i in range(0, 5)]} param_test3 = { 'subsample': [i / 10.0 for i in range(6, 10)], 'colsample_bytree': [i / 10.0 for i in range(6, 10)] } ''' gsearch1 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=19, min_child_weight=1, gamma=0.1, subsample=0.9, colsample_bytree=0.9, objective='binary:logistic', scale_pos_weight=1, seed=27), param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5) gsearch1.fit(features, target) print(gsearch1.grid_scores_) print(gsearch1.best_params_) print(gsearch1.best_score_) exit() ''' ''' classifier = xgb.XGBClassifier(learning_rate=0.03, n_estimators=300, max_depth=19, min_child_weight=1, gamma=0.1, subsample=0.9, colsample_bytree=0.9, objective='binary:logistic', seed=27).fit(features, target) ''' ''' classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1).fit(features, target) ''' ''' print(len(features)) a = np.random.permutation(len(features))[0:10000] features = features[a] target = target[a] classifier = svm.SVC(probability=True).fit(features, target) ''' #classifier = GaussianNB().fit(features, target) classifier = xgb.XGBClassifier(max_depth=5, n_estimators=300, learning_rate=0.05, objective="binary:logistic").fit( features, target) print("Saving the classifier") data_io.save_model(classifier) # accuracy 0.9729 for valid set #classifier = xgb.XGBClassifier(max_depth=5, n_estimators=300, learning_rate=0.05, objective="binary:logistic").fit(features, target) ''' accuracy 0.9723 for valid set
# Create Papers print "Parsing Papers..." cursor.execute("SELECT * from Paper;") for res in cursor: self.papers[res[0]] = paper.Paper(res[0], res[1], res[2], res[3], res[4], res[5]) print "Done" # First Update all journal/conference/coauthor information print "Parsing PaperAuthors..." cursor.execute("SELECT * from PaperAuthor;") for res in cursor: paper_id = res[0] author_id = res[1] curr_author = None curr_paper = None if paper_id in self.papers.keys(): curr_paper = self.papers[paper_id] if author_id in self.authors.keys(): curr_author = self.authors[author_id] self.update_paperauthor(curr_paper, curr_author, author_id) print "Done" if __name__ == "__main__": p = Parser() p.parse_csv() with open(data_io.get_paths()["parser_path"], "wb") as output: pickle.dump(p, output)
def process_train_samples(samples, max_srch_size=10, each_saved_size=1000000):
    '''
    Process samples: feature extraction plus downsampling.
    NB: samples sharing a srch_id that contain one positive target are treated
    as one positive sample; otherwise they are negative samples.
    max_srch_size caps how many rows are kept per srch_id - e.g. if a srch_id
    has 20 rows in the training set, only 10 randomly chosen rows are kept,
    which is the downsampling step.
    Because the training set is very large, a file is written out (and can be
    trained on) after every one million processed rows.
    '''
    # The training data is unordered, so sort first to group rows with the
    # same srch_id together
    sorted_samples = samples.sort_values(by=['srch_id'])  # group by srch_id
    sorted_samples = sorted_samples.reset_index(drop=True)  # reset row index
    processed_samples = pd.DataFrame()
    samples_in_one_srch = pd.DataFrame()
    # The loop checks whether the next row still belongs to the same srch_id
    for r_idx, sample in sorted_samples.iterrows():
        if (r_idx + 1) % 1000 == 0:
            print "processed %i sample of %i " % (r_idx + 1, sorted_samples.shape[0])
        is_next_in_same_search = True
        samples_in_one_srch = pd.concat((sample.to_frame().transpose(),
                                         samples_in_one_srch), axis=0)
        current_srch_id = sample['srch_id']
        # last row
        if (r_idx + 1) == sorted_samples.shape[0]:
            is_next_in_same_search = False
        else:
            next_srch_id = sorted_samples['srch_id'][r_idx + 1]
            if current_srch_id != next_srch_id:
                is_next_in_same_search = False
        # A whole srch_id group has been collected, so extract its features
        # (with 16 GB of RAM this took about 8 hours; this is the slow part)
        if not is_next_in_same_search:
            # if the next row is not in the same search, process the samples
            # collected for the current search
            # feature extraction for the samples
            ext_samples_in_one_srch = extract_features(samples_in_one_srch)
            # downsample: number of samples under this srch_id
            n_samples = ext_samples_in_one_srch.shape[0]
            # e.g. with max_srch_size = 10, groups larger than 10 are downsampled
            if n_samples > max_srch_size:
                # too many samples in one search, so downsample
                if np.any(ext_samples_in_one_srch['booking_bool']):
                    # a positive sample exists (a 1 in booking_bool);
                    # rows with a booked hotel must be kept
                    pos_samples = ext_samples_in_one_srch[
                        ext_samples_in_one_srch['booking_bool'] == 1]
                    neg_samples = ext_samples_in_one_srch[
                        ext_samples_in_one_srch['booking_bool'] == 0]
                    # then sample randomly from the negatives, e.g. 28 rows with
                    # max_srch_size = 10 and 1 positive -> pick 9 negatives at random
                    selected_neg_samples = neg_samples.sample(
                        n=max_srch_size - pos_samples.shape[0])
                    selected_samples = pd.concat((pos_samples, selected_neg_samples),
                                                 axis=0)
                else:
                    # no positive samples, so just select max_srch_size rows at random
                    selected_samples = ext_samples_in_one_srch.sample(n=max_srch_size)
            else:
                selected_samples = ext_samples_in_one_srch.copy()
            processed_samples = pd.concat((processed_samples, selected_samples), axis=0)
            # create new samples for the next search
            samples_in_one_srch = pd.DataFrame()
        # write the processed samples to disk every one million rows
        if (r_idx + 1) % each_saved_size == 0:
            # save samples for every each_saved_size rows
            save_file_name = 'proc_train_samples_%i.csv' % (r_idx + 1)
            save_path = get_paths()['proc_train_path']
            if not os.path.exists(save_path):
                os.mkdir(save_path)
            if np.any(np.isnan(processed_samples.values)):
                # remove nan
                processed_samples = processed_samples.fillna(value=0)
                print "remove nan."
            processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)
    # out of the loop: save all processed samples
    save_file_name = 'proc_train_samples_%i.csv' % (r_idx + 1)
    save_path = get_paths()['proc_train_path']
    if np.any(np.isnan(processed_samples.values)):
        # remove nan
        processed_samples = processed_samples.fillna(value=0)
        print "remove nan."
    processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
import joblib


def log_mean_absolute_error(y_true, y_pred):
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))


paths = get_paths("Settings_loc5.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")

names = ["Category", "ContractTime", "ContractType", "Loc1", "Loc2", "Loc3",
         "Loc4", "Loc5", "Company", "SourceName"]
le_features = map(lambda x: label_encode_column_fit_only(
    x, file_id="train_full_data_path", type_n="train_full"), names)
features = map(lambda (le, name): label_encode_column_transform(
    le, name, file_id="train_data_path", type_n="train"),
    zip(le_features, names))
description_length = map(len, read_column(paths["train_data_path"], "FullDescription"))
title_length = map(len, read_column(paths["train_data_path"], "Title"))
features.append(description_length)
def parse_paperauthor(file):
    parse(file, data_io.get_paths()["paperauthor_processed_path"])
                     label_encode_column_fit, label_encode_column_transform,
                     load_predictions, fit_predict, write_submission)
from os.path import join as path_join
#import joblib
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer
import joblib


def log_mean_absolute_error(y_true, y_pred):
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))


paths = get_paths("Settings_submission.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")

le_category, category_train = label_encode_column_fit("Category")
category_valid = label_encode_column_transform(le_category, "Category")

le_contractTime, contractTime_train = label_encode_column_fit("ContractTime")
contractTime_valid = label_encode_column_transform(le_contractTime, "ContractTime")

le_contractType, contractType_train = label_encode_column_fit("ContractType")
contractType_valid = label_encode_column_transform(le_contractType, "ContractType")

features = join_features(
import joblib
from data_io import (write_submission, get_paths)
import numpy as np
from os.path import join as path_join

paths = get_paths("Settings.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")

model_name = "vowpal_submission"
type_n = "submission"
predictions = joblib.load(
    path_join(prediction_dir, model_name + "_prediction_" + type_n))

model_name = "vowpal_submission_round"
predictions = np.exp(predictions)
predictions = predictions / 1000
#print predictions[1:10]
predictions = np.round(predictions) * 1000
joblib.dump(predictions,
            path_join(prediction_dir, model_name + "_prediction_" + type_n))
write_submission("vowpal_fastml_round.csv",
                 "vowpal_submission_round_prediction_submission",
                 unlog=False)
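A made-up single-value illustration of the rounding above: the log-space prediction is un-logged and then snapped to the nearest 1000.

import numpy as np
p = np.exp(10.309)      # ~30002 (exp of a hypothetical log-salary prediction)
p = p / 1000            # ~30.002
p = np.round(p) * 1000  # 30000.0, i.e. salaries rounded to the nearest 1000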
    # step 6 score fusion: stack the three probability vectors into one array
    prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))
    # arithmetic mean of the probabilities
    mean_score = np.mean(prob_arr, axis=0)
    # the downstream sort is ascending, so negate to get a descending ranking
    mean_score = -1.0 * mean_score
    # geometric mean (works somewhat worse in practice)
    gmean = stats.gmean(prob_arr, axis=0)
    # negate for sorting
    gmean = -1.0 * gmean

    # step 7 output results
    mean_recommendations = zip(new_test_samples['srch_id'],
                               new_test_samples['prop_id'],
                               mean_score)
    gmean_recommendations = zip(new_test_samples['srch_id'],
                                new_test_samples['prop_id'],
                                gmean)
    print("Writing predictions to file")
    data_io.write_submission(mean_recommendations,
                             submission_file='mean_result_%i.csv' % n_train_samples)
    data_io.write_submission(gmean_recommendations,
                             submission_file='gmean_result_%i.csv' % n_train_samples)


if __name__ == "__main__":
    n_train_samples = 8930723
    saved_train_sample_file = 'proc_train_samples_%i.csv' % n_train_samples
    processed_train_csv_file = os.path.join(get_paths()['proc_train_path'],
                                            saved_train_sample_file)
    do_training(processed_train_csv_file)
    do_prediction(n_train_samples)
from scipy.optimize import curve_fit

# _________________________________________________________________________________________________________
# test_person = 'me'
# test_person = 'me02'
test_person = 'gen'
# test_person = 'marie01'
# test_person = 'marie02'

footage_file_name = test_person + '.0001.mov'
render_file_name = test_person + '_tracked_calib.0001.avi'
render_l_eye_file_name = test_person + '_eye_l_tracked.0001.avi'
render_r_eye_file_name = test_person + '_eye_r_tracked.0001.avi'

footage_folder, render_folder, config_path, data_path = data_io.get_paths(test_person)
print('footage folder:' + footage_folder)
print('render folder:' + render_folder)

predictor_file_path = config_path + 'shape_predictor_68_face_landmarks.dat'

lmarks_left_eye = [42, 43, 44, 45, 46, 47]   # starting from inner corner -> up
lmarks_right_eye = [36, 37, 38, 39, 40, 41]  # starting from outer corner -> up

curr_lmpoints_r_eye_x = []
curr_lmpoints_r_eye_y = []
curr_lmpoints_l_eye_x = []
curr_lmpoints_l_eye_y = []

roi_eye_offset_x = 260