def parse_db(self):
        conn = data_io.get_db_conn()
        cursor = conn.cursor()

        # Create authors
        print "Parsing Authors..."
        cursor.execute("SELECT * from Author;")
        for res in cursor:
            self.authors[res[0]] = author.Author(res[0], res[1], res[2])
        print "Done"

        # Create Papers
        print "Parsing Papers..."
        cursor.execute("SELECT * from Paper;")
        for res in cursor:
            self.papers[res[0]] = paper.Paper(res[0], res[1], res[2], res[3], res[4], res[5])
        print "Done"
                
        # First Update all journal/conference/coauthor information
        print "Parsing PaperAuthors..."
        cursor.execute("SELECT * from PaperAuthor;")
        for res in cursor:
            paper_id = res[0]
            author_id = res[1]
            # dict.get returns None when the id is missing, which matches the
            # original membership checks without building .keys() lists
            curr_paper = self.papers.get(paper_id)
            curr_author = self.authors.get(author_id)
            self.update_paperauthor(curr_paper, curr_author, author_id)
        print "Done"
def main():
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    conn = data_io.get_db_conn()
    feature_name = open("feature_list.txt").read().split()
    # if size < len(feature_name):  # to be done!
    for table_name in ["ValidPaper"]:
        if rank > 0:
            # getting features by parallel computing
            print "getting features at node " + str(rank)
            feature = data_io_parallel.get_features_db_parallel(conn, rank, table_name, feature_name[rank - 1])
        else:
            feature = data_io_parallel.get_trained_validation_data(conn, table_name)

        # sending features to rank 0
        print "sending features to node " + str(rank)
        features = comm.gather(feature, root=0)
        # print features
        if rank == 0:
            temp = []
            for f in features:
                temp.extend(f)
            print "Successfully got the features from " + table_name
            data = map(list, np.array(temp).T)

    if rank == 0:
        author_paper_ids = [x[:2] for x in data]
        features = [x[2:] for x in data]

        print("Loading the classifier")
        classifier = data_io.load_model()
        print classifier.feature_importances_

        print("Making predictions")
        predictions = classifier.predict_proba(features)[:, 1]
        predictions = list(predictions)

        author_predictions = defaultdict(list)
        paper_predictions = {}

        for (a_id, p_id), pred in zip(author_paper_ids, predictions):
            author_predictions[a_id].append((pred, p_id))

        for author_id in sorted(author_predictions):
            paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
            paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

        print("Writing predictions to file")
        data_io.write_submission(paper_predictions)
        print "Prediction completed, exit..."
        comm.Abort()
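# Minimal sketch of the gather pattern used in main() above: every rank passes
# one Python object to comm.gather and only root receives the list of all of
# them (non-root ranks get None). Names here are illustrative, not from the
# project; run with e.g. "mpiexec -n 4 python gather_sketch.py".
# Note that comm.Abort() above is MPI_Abort: rank 0 uses it to tear down the
# whole job once its work is done instead of leaving worker ranks idle.
from mpi4py import MPI

def gather_sketch():
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    local_rows = [[rank, rank * 10]]            # stand-in for one rank's feature rows
    gathered = comm.gather(local_rows, root=0)  # list of per-rank objects at root
    if rank == 0:
        flat = []
        for part in gathered:                   # same flattening as temp.extend(f)
            flat.extend(part)
        print flat

if __name__ == "__main__":
    gather_sketch()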
def main():
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    conn = data_io.get_db_conn()
    feature_name = open("feature_list.txt").read().split()
    # if size < len(feature_name):  # to be done!
    for table_name in ["TrainDeleted", "TrainConfirmed"]:
        if rank > 0:
            # getting features by parallel computing
            print "getting features at node " + str(rank)
            feature = data_io_parallel.get_features_db_parallel(conn, rank, table_name, feature_name[rank - 1])
        else:
            feature = data_io_parallel.get_trained_validation_data(conn, table_name)

        # sending features to rank 0
        print "sending features to node " + str(rank)
        features = comm.gather(feature, root=0)
        # print features
        if rank == 0:
            temp = []
            for f in features:
                temp.extend(f)
            print "Successfully got the features from " + table_name
            if table_name == "TrainDeleted":
                features_deleted = map(list, np.array(temp).T)
            else:
                features_conf = map(list, np.array(temp).T)

    if rank == 0:
        features = [x[2:] for x in features_deleted + features_conf]
        target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]
        print("Training the Classifier")
        classifier = RandomForestClassifier(n_estimators=50,
                                            verbose=2,
                                            n_jobs=1,
                                            min_samples_split=10,
                                            random_state=1)
        classifier.fit(features, target)

        print("Saving the classifier")
        data_io.save_model(classifier)
        print "Training completed, exit..."
        comm.Abort()
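# Hypothetical pickle-based stand-ins for the data_io.save_model / load_model
# helpers called above and in the prediction main(); the project's own
# implementations (file name, pickle protocol) may differ.
import pickle

MODEL_PATH = "model.pickle"   # assumed output location

def save_model(model):
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(model, f)

def load_model():
    with open(MODEL_PATH, "rb") as f:
        return pickle.load(f)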
def main():
    conn = data_io.get_db_conn()
    cursor = conn.cursor()
    if not data_io.table_view_existence_db('AP_features', conn):
        query = """
                CREATE TABLE AP_features (
                Result int, authorid bigint, paperid bigint, AP float, AP_PP float,
                AP_PJ_JP float, AP_PC_CP float, AP_PJ_JJ_JP float, AP_PC_CC_CP float)
                """
        cursor.execute(query)
        conn.commit()
    query = """
            COPY AP_features FROM '##path##sampleTrain.txt' DELIMITER ' '
            """
    query = query.replace(
        '##path##',
        '/home/yingzhen/Projects/KDDCUP2013/benchmark/PythonBenchmark/')
    cursor.execute(query)
    conn.commit()
    query = """
            SELECT * FROM AP_features LIMIT 3
            """
    cursor.execute(query)
    res = cursor.fetchall()
    return res
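# COPY ... FROM above reads the file on the database server, so the hard-coded
# path must exist on that machine. A client-side alternative, assuming (not
# confirmed by the snippet) that data_io.get_db_conn() returns a psycopg2
# connection, streams the same space-delimited file from the client instead:
def load_ap_features_client_side(conn, path):
    with open(path) as f:
        cur = conn.cursor()
        cur.copy_from(f, 'ap_features', sep=' ')   # psycopg2 client-side COPY
        conn.commit()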