def generate_titles():
	log.info( "π: read text" )
	# text = [ r[6:8] for r in cu.get_reader( file_name ) ]
	text = [ r[6] for r in cu.get_reader( file_name ) ]

	log.info( "π: tokenize text" )
	text = [ nltk.word_tokenize(t) for t in text ]

	log.info( "π: stem tokens" )
	text = [ pd.Series( t ).apply( stem ) for t in text ]

	log.info( "π: to lower case" )
	text = [ pd.Series( t ).apply( lower ) for t in text ]

	log.info( "π: process text" )
	res = {}
	for st in pd.Series( open_status ).unique():
		res.setdefault( st, [] )

	for i,x in enumerate( open_status ):
		res[x].extend( text[i] )

	log.info( "π: uniquify text" )
	res = dict([ ( k, pd.Series( v ).unique() ) for k,v in res.items() ])

	log.info( "π: save vocabulary" )
	np.savez( RESOURCES_DIR + 'titles.npz', text=res )
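# Hypothetical helper (not part of the original example): reading the saved
# vocabulary back.  np.savez stores the dict as a 0-d object array, so it has
# to be unwrapped with .item(); recent NumPy versions also need
# allow_pickle=True to load object arrays.
def load_titles():
	data = np.load( RESOURCES_DIR + 'titles.npz', allow_pickle=True )
	return data['text'].item()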
Example #2
def build_vocab():
    i = 0
    vocab = []
    itr = get_reader("train-sample.csv")
    for entry in itr:
        title_body = string.join(entry[6:8])
        ntxt = normalize(title_body)
        if not ntxt:
            continue
        
        nwords = ntxt.split()
        for nword in nwords:
            if nword not in vocab:
                vocab.append(nword)
        
        if len(vocab) > 5000:
            break
    
    with open("vocab.dat", "w") as vocab_writer:
        pickle.dump(vocab, vocab_writer)
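# Hypothetical counterpart (not in the original example): loading the pickled
# vocabulary back.  get_features() below relies on a read_pickle_file() helper
# defined elsewhere, which presumably does something similar.  Note these
# examples target Python 2; under Python 3 the pickle file would have to be
# opened in binary mode ("rb"/"wb").
def load_vocab():
    with open("vocab.dat", "r") as vocab_reader:
        return pickle.load(vocab_reader)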
def generate_tags():
	log.info( "π: read tags" )
	tags = [ r[8:13] for r in cu.get_reader( file_name ) ]
	
	log.info( "π: process tags" )
	res = {}
	for st in pd.Series( open_status ).unique():
		# res.setdefault( st, set() )
		res.setdefault( st, [] )

	for i,x in enumerate( open_status ):
		# res[x] = res[x].union( tags[i] )
		res[x].extend( tags[i] )

	res = dict([ ( k, pd.Series( v ).unique() ) for k,v in res.items() ])

	log.info( "π: save tags" )
	np.savez( RESOURCES_DIR + 'tags.npz', tags=res )
	del tags
Example #4
def get_features():
    """
    """
    rows = get_reader("train-sample.csv")
    for row in rows:
        # remove PostId
        # remove OwnerUserId
        # create a feature called owner_age_at_post_creation = (PostCreationDate - OwnerCreationDate)
        # keep ReputationAtPostCreation
        # keep OwnerUndeletedAnswerCountAtPostTime
        # remove Tags
        # remove PostClosedDate
        
        post_creation_date = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        owner_creation_date = datetime.strptime(row[3], "%m/%d/%Y %H:%M:%S")
        owner_age_now = (post_creation_date - owner_creation_date).days
        
        latest_reputation = row[4]
        undeleted_answer = row[5]
        title_body = string.join(row[6:8])
        
        ntxt = normalize(title_body)
        if not ntxt:
            continue
        
        vocab = read_pickle_file()
        # size the feature vector to the vocabulary; build_vocab can overshoot
        # 5000 words, so a fixed range(5000) risks an IndexError below
        txt_feat = [0 for i in range(len(vocab))]
        nwords = ntxt.split()
        
        for nword in nwords:
            if nword in vocab:
                txt_feat[vocab.index(nword)] = 1
        
        txt_feat.extend([owner_age_now, int(latest_reputation),
                                int(undeleted_answer)])
        print txt_feat
        
        exit()
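# Possible refinement (not in the original code): vocab.index() scans the whole
# list for every word, so the encoding above gets slow for large vocabularies.
# A word -> index map makes each lookup O(1).
def encode_bag_of_words(nwords, vocab):
    word_index = dict((w, i) for i, w in enumerate(vocab))
    txt_feat = [0] * len(vocab)
    for nword in nwords:
        if nword in word_index:
            txt_feat[word_index[nword]] = 1
    return txt_feat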
Example #5
def main():
    predictions = [actual_lookup[r[14]] for r in cu.get_reader(actual_file)]
    cu.write_submission("actual_benchmark.csv", predictions)
import logging

import numpy as np
import pandas as pd

import nltk
import nltk.stem.snowball as snowball

import competition_utilities as cu

DATA_DIR = '../data/'
RESOURCES_DIR = './resources/'
file_name = 'train.csv'

logging.basicConfig( level=logging.INFO,
					format='%(asctime)s %(levelname)s %(message)s' )
log = logging.getLogger(__name__)

log.info( "π: read data" )
header = cu.get_header( file_name )
open_status = [ r[14] for r in cu.get_reader( file_name ) ]

def generate_tags():
	log.info( "π: read tags" )
	tags = [ r[8:13] for r in cu.get_reader( file_name ) ]
	
	log.info( "π: process tags" )
	res = {}
	for st in pd.Series( open_status ).unique():
		# res.setdefault( st, set() )
		res.setdefault( st, [] )

	for i,x in enumerate( open_status ):
		# res[x] = res[x].union( tags[i] )
		res[x].extend( tags[i] )