def sample_train(input_file):
    """Build a class-balanced training sample from *input_file*.

    Takes every closed question plus an equally-sized reservoir sample of
    open questions, shuffles them together, and returns (header, rows).
    """
    n_closed = cu.get_closed_count(input_file)
    # Down-sample the (much larger) open-question stream to match the
    # closed-question count, then append every closed question.
    balanced = reservoir_sample(cu.iter_open_questions(input_file), n_closed)
    balanced.extend(cu.iter_closed_questions(input_file))
    random.shuffle(balanced)
    return cu.get_header(input_file), balanced
예제 #2
0
def sample_train(input_file):
    """Return (header, rows): all closed questions together with a reservoir
    sample of open questions of the same size, in random order."""
    target = cu.get_closed_count(input_file)
    rows = reservoir_sample(cu.iter_open_questions(input_file), target)
    for closed_row in cu.iter_closed_questions(input_file):
        rows.append(closed_row)
    random.shuffle(rows)
    hdr = cu.get_header(input_file)
    return hdr, rows
def main():
    """Split train.csv into two files on a fixed creation-date cutoff.

    Rows created before 2012-07-18 are written to train-A.csv, the rest to
    train-B.csv (both under cu.data_path, without the index column).
    """
    data = cu.get_dataframe("train.csv")
    # DataFrame.sort_index(by=...) was deprecated and later removed from
    # pandas; sort_values is the supported way to sort on a column.
    data = data.sort_values(by="PostCreationDate")

    cutoff = datetime.datetime(2012, 7, 18)

    early = data[data["PostCreationDate"] < cutoff]
    late = data[data["PostCreationDate"] >= cutoff]
    early.to_csv(os.path.join(cu.data_path, "train-A.csv"), index=False)
    late.to_csv(os.path.join(cu.data_path, "train-B.csv"), index=False)
예제 #4
0
def main():
    """Partition train.csv by post creation date.

    Everything created strictly before 2012-07-18 goes to train-A.csv and
    the remainder to train-B.csv, both written under cu.data_path.
    """
    data = cu.get_dataframe("train.csv")
    # sort_index(by=...) no longer exists in modern pandas; sort_values(by=...)
    # is the equivalent column sort.
    data = data.sort_values(by="PostCreationDate")

    cutoff = datetime.datetime(2012, 7, 18)

    before_cutoff = data["PostCreationDate"] < cutoff
    data[before_cutoff].to_csv(
        os.path.join(cu.data_path, "train-A.csv"), index=False)
    data[~before_cutoff].to_csv(
        os.path.join(cu.data_path, "train-B.csv"), index=False)
def sample_train(input_file):
    """Assemble a balanced, shuffled training set from *input_file*.

    Returns (header, rows) containing every closed question plus an
    equally-sized reservoir sample of open questions, with progress printed
    at each stage.
    """
    print("get closed question count")
    quota = cu.get_closed_count(input_file)
    print("sample open questions")
    rows = reservoir_sample(cu.iter_open_questions(input_file), quota)
    print("get all closed questions")
    rows += list(cu.iter_closed_questions(input_file))
    print("shuffle all the data")
    random.shuffle(rows)
    return cu.get_header(input_file), rows
def sample_train(input_file):
    """Return (header, sample): a shuffled mix of all closed questions and an
    open-question reservoir sample of the same size."""
    print("get closed question count")
    n = cu.get_closed_count(input_file)
    print("sample open questions")
    data = reservoir_sample(cu.iter_open_questions(input_file), n)
    print("get all closed questions")
    for question in cu.iter_closed_questions(input_file):
        data.append(question)
    print("shuffle all the data")
    random.shuffle(data)
    column_names = cu.get_header(input_file)
    return column_names, data
예제 #7
0
def main():
    # Split train.csv into a 2/3 training and 1/3 test partition, ordered
    # by post creation date.  (Python 2 source: print statements.)
    print "get data"
    data = cu.get_dataframe("train.csv")
    print "sort by creation date"
    data = data.sort_index(by="PostCreationDate")
    print "cut off"
    header = cu.get_header("train.csv")
    # Three equal chunks of the date-sorted frame: the first two form the
    # training set, the last becomes the test set.
    splits = np.array_split(data, 3)
    frames = [splits[0], splits[1]]
    train_data = pd.concat(frames)
    test_data = splits[2]
    # cutoff = datetime.datetime(2012, 7, 18)
    print "write to csv"
    # NOTE(review): train_data.csv appears to be written twice -- once via
    # cu.write_sample and once via to_csv below; confirm which is intended.
    cu.write_sample("train_data.csv", header, train_data)
    train_data.to_csv(os.path.join(cu.data_path, "train_data.csv"), index=False, header=header)
    test_data.to_csv(os.path.join(cu.data_path, "test_data.csv"), index=False, header=header)
def main():
    """Write a sample of *train_file* to *output_file* with its header row.

    Rows come from cu.get_lines(train_file, lines) -- presumably the first
    `lines` rows of the file; confirm against competition_utilities.
    All three names (train_file, lines, output_file) are module globals.
    """
    print("Reading the data", train_file)
    column_names = cu.get_header(train_file)

    subset = cu.get_lines(train_file, lines)
    cu.write_sample(output_file, column_names, subset)
				values.append(q[field])
			else:
				values.append("''")
		writer.writerow(values)
		i = i + 1
	print "written out total for this class: " + str(i)
	return i
		
# Script entry point: write a per-class-capped sample of the training file
# (Python 2 source: print statements, tab indentation).
if __name__=="__main__":

	start = time.time()
	
	filename_in = train_file
	filename_out = os.path.join(main_path, "data", output_sampled_file)
	
	# One CSV writer shared by all classes; header row goes out first.
	writer = csv.writer(open(filename_out, "w"), lineterminator="\n")
	writer.writerow(cu.get_header(filename_in))
	
	total_written = 0
	# output_all_entries == 0 caps each class at an equal share of the row
	# budget; -1 presumably disables the cap inside sample_by_class -- confirm.
	# NOTE(review): under Python 2 this `/` is integer division.
	if output_all_entries == 0:
		per_class_limit = cu.output_rows_limit / len(question_status)
	else:
		per_class_limit = -1
	for status in question_status:
		total_written = total_written + sample_by_class(writer,status,per_class_limit)

	print "total rows written:" + str(total_written)
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)

	
예제 #10
0
import nltk
import nltk.stem.snowball as snowball

import competition_utilities as cu

# Data locations and the competition file processed by this module.
DATA_DIR = '../data/'
RESOURCES_DIR = './resources/'
file_name = 'train.csv'

logging.basicConfig( level=logging.INFO,
					format='%(asctime)s %(levelname)s %(message)s' )
log = logging.getLogger(__name__)

# Read at import time: one status value per row of the training file.
# Presumably column 14 is the question's open/closed status -- confirm
# against the CSV schema.
log.info( "π: read data" )
header = cu.get_header( file_name )
open_status = [ r[14] for r in cu.get_reader( file_name ) ]

def generate_tags():
	# Group the tag columns (indices 8..12) of every row by the question's
	# status value from the module-level open_status list.
	# NOTE(review): the visible body builds `res` but never returns or stores
	# it -- this definition looks truncated in this file; confirm upstream.
	log.info( "π: read tags" )
	tags = [ r[8:13] for r in cu.get_reader( file_name ) ]
	
	log.info( "π: process tags" )
	res = {}
	# One (initially empty) list per distinct status value.
	for st in pd.Series( open_status ).unique():
		# res.setdefault( st, set() )
		res.setdefault( st, [] )

	# Lists (not sets) are used, so duplicate tags are kept per status.
	for i,x in enumerate( open_status ):
		# res[x] = res[x].union( tags[i] )
		res[x].extend( tags[i] )
            else:
                values.append("''")
        writer.writerow(values)
        i = i + 1
    print "written out total for this class: " + str(i)
    return i


# Script entry point: write a per-class-capped sample of the training file
# (Python 2 source: print statements).
if __name__ == "__main__":

    start = time.time()

    filename_in = train_file
    filename_out = os.path.join(main_path, "data", output_sampled_file)

    # One CSV writer shared across classes; header row is written first.
    writer = csv.writer(open(filename_out, "w"), lineterminator="\n")
    writer.writerow(cu.get_header(filename_in))

    total_written = 0
    # output_all_entries == 0 caps each class at an equal share of the row
    # budget; -1 presumably disables the cap inside sample_by_class -- confirm.
    # NOTE(review): under Python 2 this `/` is integer division.
    if output_all_entries == 0:
        per_class_limit = cu.output_rows_limit / len(question_status)
    else:
        per_class_limit = -1
    for status in question_status:
        total_written = total_written + sample_by_class(
            writer, status, per_class_limit)

    print "total rows written:" + str(total_written)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)