def main():
    """Score each test author's candidate papers and write a ranked submission.

    Loads the test set, relational meta data, and a trained classifier, then
    for every author ranks their candidate papers by predicted probability of
    class 1 and writes the ordered paper ids via data_io.write_submission.
    """
    print("Reading the test data")
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                # Feature extraction failed for this pair; the paper is
                # dropped from the ranking for this author.
                print("Error at Author Id %d And Paper Id %d"
                      % (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)

        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:, 1]

        # BUG FIX: zip against the *filtered* paper_ids, not row["PaperIds"].
        # When any get_features call returned None, preds is shorter than
        # row["PaperIds"], so the original pairing misaligned scores with
        # papers and produced wrong rankings.
        paper_ids_sorted = sorted(zip(preds, paper_ids), reverse=True)
        predictions.append([pid for _, pid in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Rank each test author's candidate papers by classifier confidence.

    Reads the test data, relational meta data, and a saved model; for every
    author, scores the candidate papers with predict_proba and writes the
    paper ids, best first, through data_io.write_submission.
    """
    print("Reading the test data")
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                # Pair could not be featurized; it is excluded from both
                # the feature matrix and the ranking below.
                print("Error at Author Id %d And Paper Id %d"
                      % (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)

        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:, 1]

        # BUG FIX: pair scores with the filtered paper_ids list. The original
        # zipped against row["PaperIds"], which is longer than preds whenever
        # a feature row was skipped, silently mis-assigning probabilities.
        paper_ids_sorted = sorted(zip(preds, paper_ids), reverse=True)
        print(paper_ids_sorted)
        predictions.append([pid for _, pid in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Train the author-paper classifier and persist it.

    Builds a labelled feature matrix from the training data — deleted papers
    are labelled 1, confirmed papers 0 — fits a RandomForestClassifier on it,
    and saves the fitted model via data_io.save_model.
    """
    print("Reading in the training data")
    train = data_io.read_train()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Extracting features")
    features = []
    target = []
    for author_id, row in train.iterrows():
        # Deleted (incorrectly attributed) papers are the positive class.
        for paper_id in row["DeletedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d"
                      % (author_id, paper_id))
            else:
                target.append(1)
                features.append(s)
        # Confirmed papers are the negative class.
        for paper_id in row["ConfirmedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d"
                      % (author_id, paper_id))
            else:
                target.append(0)
                features.append(s)

    print("Target Length: %d" % len(target))
    print("Feature Length: %d" % len(features))
    feature_matrix = pd.DataFrame(features)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    # FIX: removed the bare `except:` that dropped into pdb.set_trace() —
    # it swallowed the exception type and hangs non-interactive runs.
    # Let any fit failure propagate with its full traceback instead.
    classifier.fit(feature_matrix, target)

    print("Saving the classifier")
    data_io.save_model(classifier)
def main():
    """Fit a random forest on labelled author-paper pairs and save it.

    Labels: 1 for papers in DeletedPaperIds (wrong attribution), 0 for papers
    in ConfirmedPaperIds. The fitted classifier is stored with
    data_io.save_model.
    """
    print("Reading in the training data")
    train = data_io.read_train()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Extracting features")
    features = []
    target = []
    for author_id, row in train.iterrows():
        # Positive examples: papers wrongly attributed to the author.
        for paper_id in row["DeletedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d"
                      % (author_id, paper_id))
            else:
                target.append(1)
                features.append(s)
        # Negative examples: papers the author confirmed.
        for paper_id in row["ConfirmedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d"
                      % (author_id, paper_id))
            else:
                target.append(0)
                features.append(s)

    print("Target Length: %d" % len(target))
    print("Feature Length: %d" % len(features))
    feature_matrix = pd.DataFrame(features)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    # FIX: the original wrapped fit() in a bare `except:` that invoked
    # pdb.set_trace(), hiding the error and blocking batch runs. Failures
    # now propagate normally with a complete traceback.
    classifier.fit(feature_matrix, target)

    print("Saving the classifier")
    data_io.save_model(classifier)