def _normalize_timestr(timestr):
    """Collapse the unit words of a time string into short unit codes.

    Numeric input (int, long or float) is handed back unchanged.  Any other
    value is first passed through ``normalize`` (defined elsewhere in this
    module) and then every known spelling of a unit is replaced by its
    abbreviation.  Finally 'ms' is rewritten to 'x' and 'd' to 'p'.
    """
    if isinstance(timestr, (int, long, float)):
        return timestr
    # Replacement order is significant: e.g. 'milliseconds' contains
    # 'seconds', so the millisecond spellings must be handled first, and the
    # longer spellings of each unit come before the shorter ones.
    unit_aliases = (
        ('ms', ('milliseconds', 'millisecond', 'millis')),
        ('s', ('seconds', 'second', 'secs', 'sec')),
        ('m', ('minutes', 'minute', 'mins', 'min')),
        ('h', ('hours', 'hour')),
        ('d', ('days', 'day')),
    )
    compact = normalize(timestr)
    for short, aliases in unit_aliases:
        for alias in aliases:
            compact = compact.replace(alias, short)
    # 1) 'ms' -> 'x' to ease processing later
    # 2) 'd' -> 'p' because float('1d') returns 1.0 in Jython (bug submitted)
    compact = compact.replace('ms', 'x')
    return compact.replace('d', 'p')
def build_vocab():
    """Collect a word vocabulary from ``train-sample.csv`` and pickle it.

    Columns 6 and 7 of every row are joined with a single space, passed
    through ``normalize`` and split on whitespace.  Each word not seen before
    is appended to the vocabulary in first-seen order.  Reading stops after
    the first row that pushes the vocabulary past 5000 words, so the stored
    list can be slightly longer than 5000.  The resulting list is pickled to
    ``vocab.dat``.
    """
    vocab = []
    # Companion set gives O(1) membership tests; the list keeps the
    # first-seen ordering that is written to disk.
    seen = set()
    for entry in get_reader("train-sample.csv"):
        title_body = " ".join(entry[6:8])
        ntxt = normalize(title_body)
        if not ntxt:
            # Nothing left after normalisation; the row contributes no words.
            continue
        for nword in ntxt.split():
            if nword not in seen:
                seen.add(nword)
                vocab.append(nword)
        if len(vocab) > 5000:
            break
    # Pickle streams must be written to a file opened in binary mode.
    with open("vocab.dat", "wb") as vocab_writer:
        pickle.dump(vocab, vocab_writer)
def get_features():
    """Build and print the feature vector of the first usable training row.

    For each row of ``train-sample.csv`` the function derives:

    * the owner's account age in days at post creation (row[1] - row[3]),
    * a 5000-slot binary bag-of-words vector over the pickled vocabulary,
      computed from the normalised text of columns 6 and 7,
    * the reputation (row[4]) and undeleted-answer count (row[5]) as ints.

    Rows whose normalised text is empty are skipped.  The first row that
    yields a vector is printed and the interpreter is then terminated via
    ``exit()``.

    Raises:
        ValueError: if a date column does not match "%m/%d/%Y %H:%M:%S" or a
            numeric column cannot be converted with ``int``.
    """
    vocab_slots = 5000
    word_index = None  # loaded lazily, once, on the first usable row
    rows = get_reader("train-sample.csv")
    for row in rows:
        # remove PostId
        # remove OwnerUserId
        # create a feature called owner_age_at_post_creation = (PostCreationDate - OwnerCreationDate)
        # keep ReputationAtPostCreation
        # keep OwnerUndeletedAnswerCountAtPostTime
        # remove Tags
        # remove PostClosedDate
        post_creation_date = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        owner_creation_date = datetime.strptime(row[3], "%m/%d/%Y %H:%M:%S")
        owner_age_now = (post_creation_date - owner_creation_date).days
        latest_reputation = row[4]
        undeleted_answer = row[5]
        # presumably columns 6 and 7 are the post title and body -- confirm
        # against the CSV header.
        title_body = " ".join(row[6:8])
        ntxt = normalize(title_body)
        if not ntxt:
            continue
        if word_index is None:
            # Map each word to its first position, matching list.index(),
            # without rescanning the vocabulary for every word.
            word_index = {}
            for position, word in enumerate(read_pickle_file()):
                word_index.setdefault(word, position)
        txt_feat = [0] * vocab_slots
        for nword in ntxt.split():
            position = word_index.get(nword)
            # The stored vocabulary may hold more than 5000 words; positions
            # beyond the vector are ignored instead of raising IndexError.
            if position is not None and position < vocab_slots:
                txt_feat[position] = 1
        txt_feat.extend([owner_age_now, int(latest_reputation), int(undeleted_answer)])
        print(txt_feat)
        # NOTE(review): stops the whole program after the first printed row;
        # looks like debugging scaffolding -- confirm before relying on it.
        exit()
def matches(string, pattern, ignore=[], caseless=True, spaceless=True):
    """Tell whether the normalised `string` matches the normalised `pattern`.

    Both arguments go through ``normalize`` with the given `ignore`,
    `caseless` and `spaceless` settings.  The normalised pattern is turned
    into a regular expression by ``_get_match_regexp`` and applied with
    ``re.match`` using ``re.DOTALL``, so '.' in that expression also matches
    newlines.  Returns True on a match, False otherwise.
    """
    normalized_string = normalize(string, ignore, caseless, spaceless)
    normalized_pattern = normalize(pattern, ignore, caseless, spaceless)
    found = re.match(_get_match_regexp(normalized_pattern),
                     normalized_string, re.DOTALL)
    return found is not None
def eq_any(str_, str_list, ignore=[], caseless=True, spaceless=True):
    """Tell whether `str_` equals any item of `str_list` after normalisation.

    `str_` and each candidate are passed through ``normalize`` with the same
    `ignore`, `caseless` and `spaceless` settings before comparison.
    Candidates are examined in order and the search stops at the first hit.
    Returns False for an empty `str_list`.
    """
    wanted = normalize(str_, ignore, caseless, spaceless)
    return any(
        wanted == normalize(candidate, ignore, caseless, spaceless)
        for candidate in str_list
    )
def eq(str1, str2, ignore=[], caseless=True, spaceless=True):
    """Tell whether `str1` and `str2` are equal after normalisation.

    Both values are passed through ``normalize`` with the same `ignore`,
    `caseless` and `spaceless` settings and the results are compared.
    """
    return (normalize(str1, ignore, caseless, spaceless)
            == normalize(str2, ignore, caseless, spaceless))