def generate_titles():
    log.info("π: read text")
    # title column only; the commented-out variant also kept the body column
    # text = [r[6:8] for r in cu.get_reader(file_name)]
    text = [r[6] for r in cu.get_reader(file_name)]
    log.info("π: tokenize text")
    text = [nltk.word_tokenize(t) for t in text]
    log.info("π: stem tokens")
    text = [pd.Series(t).apply(stem) for t in text]
    log.info("π: to lower case")
    text = [pd.Series(t).apply(lower) for t in text]
    log.info("π: process text")
    # bucket every title token under the post's open/closed status
    res = {}
    for st in pd.Series(open_status).unique():
        res.setdefault(st, [])
    for i, x in enumerate(open_status):
        res[x].extend(text[i])
    log.info("π: uniquify text")
    res = {k: pd.Series(v).unique() for k, v in res.items()}
    log.info("π: save vocabulary")
    np.savez(RESOURCES_DIR + 'titles.npz', text=res)
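# Usage sketch for reading the vocabulary back: np.savez stores the dict as a
# 0-d object array, so loading it needs allow_pickle=True plus .item() to
# recover the dict (the path assumes RESOURCES_DIR as defined below):
#
#   vocab_by_status = np.load(RESOURCES_DIR + 'titles.npz',
#                             allow_pickle=True)['text'].item()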
import pickle


def build_vocab():
    vocab = []
    seen = set()  # O(1) membership test; the list keeps insertion order
    for entry in get_reader("train-sample.csv"):
        title_body = " ".join(entry[6:8])
        ntxt = normalize(title_body)
        if not ntxt:
            continue
        for nword in ntxt.split():
            if nword not in seen:
                seen.add(nword)
                vocab.append(nword)
        if len(vocab) > 5000:
            break
    # pickle needs a binary file handle
    with open("vocab.dat", "wb") as vocab_writer:
        pickle.dump(vocab, vocab_writer)
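# `normalize` is used above but not defined in this file. A minimal sketch of
# what build_vocab (and get_features below) appear to expect; the cleaning
# rules here are assumptions, not the original implementation:
import re


def normalize(text):
    # Lower-case, turn non-alphanumeric runs into spaces, collapse whitespace.
    return " ".join(re.sub(r"[^a-z0-9]+", " ", text.lower()).split())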
def generate_tags():
    log.info("π: read tags")
    tags = [r[8:13] for r in cu.get_reader(file_name)]
    log.info("π: process tags")
    # bucket every tag under the post's open/closed status
    res = {}
    for st in pd.Series(open_status).unique():
        # res.setdefault(st, set())
        res.setdefault(st, [])
    for i, x in enumerate(open_status):
        # res[x] = res[x].union(tags[i])
        res[x].extend(tags[i])
    res = {k: pd.Series(v).unique() for k, v in res.items()}
    log.info("π: save tags")
    np.savez(RESOURCES_DIR + 'tags.npz', tags=res)
from datetime import datetime


def get_features():
    """Turn each training row into a 5003-dim feature vector: 5000
    bag-of-words flags plus owner age, reputation and answer count."""
    # Columns kept/dropped per the original plan:
    #   remove PostId, OwnerUserId, Tags, PostClosedDate
    #   derive owner_age_at_post_creation = PostCreationDate - OwnerCreationDate
    #   keep ReputationAtPostCreation, OwnerUndeletedAnswerCountAtPostTime
    vocab = read_pickle_file()  # load once, not on every row
    for row in get_reader("train-sample.csv"):
        post_creation_date = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        owner_creation_date = datetime.strptime(row[3], "%m/%d/%Y %H:%M:%S")
        owner_age_at_post_creation = (post_creation_date - owner_creation_date).days
        latest_reputation = row[4]
        undeleted_answer = row[5]
        title_body = " ".join(row[6:8])
        ntxt = normalize(title_body)
        if not ntxt:
            continue
        # binary bag-of-words over the 5000-word vocabulary
        txt_feat = [0] * 5000
        for nword in ntxt.split():
            if nword in vocab:
                txt_feat[vocab.index(nword)] = 1
        txt_feat.extend([owner_age_at_post_creation,
                         int(latest_reputation),
                         int(undeleted_answer)])
        print(txt_feat)
        exit()  # debug: inspect the first feature vector only
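# `read_pickle_file` is not defined in this file either. A sketch that matches
# the "vocab.dat" pickle written by build_vocab (the default path is an
# assumption taken from that function):
def read_pickle_file(path="vocab.dat"):
    # Load the pickled vocabulary list written by build_vocab.
    with open(path, "rb") as fh:
        return pickle.load(fh)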
def main():
    predictions = [actual_lookup[r[14]] for r in cu.get_reader(actual_file)]
    cu.write_submission("actual_benchmark.csv", predictions)
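# Standard entry point (assumed; not present in the original):
if __name__ == "__main__":
    main()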
import logging

import nltk
import nltk.stem.snowball as snowball
import numpy as np
import pandas as pd

import competition_utilities as cu

DATA_DIR = '../data/'
RESOURCES_DIR = './resources/'
file_name = 'train.csv'

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
log = logging.getLogger(__name__)

log.info("π: read data")
header = cu.get_header(file_name)
open_status = [r[14] for r in cu.get_reader(file_name)]
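# `generate_titles` applies `stem` and `lower` helpers that are not defined
# anywhere in this file. A minimal sketch, assuming the snowball stemmer
# imported above is what was intended (both definitions are assumptions):
stemmer = snowball.EnglishStemmer()


def stem(token):
    # Reduce a token to its stem, e.g. "questions" -> "question".
    return stemmer.stem(token)


def lower(token):
    # Lower-case a single token.
    return token.lower()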