Example #1
0
def _normalize_timestr(timestr):
    """Collapse the unit words of a time string into short codes.

    Numbers (int, long, float) are handed back unchanged.  Anything else
    is passed through ``normalize`` and its unit names are shortened, in
    this order: milliseconds -> 'ms', seconds -> 's', minutes -> 'm',
    hours -> 'h', days -> 'd'.  Finally 'ms' becomes 'x' and 'd'
    becomes 'p'.
    """
    if isinstance(timestr, (int, long, float)):
        return timestr
    # Order matters: groups are applied top to bottom, and within a group
    # the longer spellings are replaced before their own prefixes.
    unit_aliases = (
        ('ms', ('milliseconds', 'millisecond', 'millis')),
        ('s', ('seconds', 'second', 'secs', 'sec')),
        ('m', ('minutes', 'minute', 'mins', 'min')),
        ('h', ('hours', 'hour')),
        ('d', ('days', 'day')),
    )
    text = normalize(timestr)
    for short, spellings in unit_aliases:
        for spelling in spellings:
            text = text.replace(spelling, short)
    # 'ms' -> 'x' eases later processing; 'd' -> 'p' because float('1d')
    # returns 1.0 in Jython (bug submitted).
    text = text.replace('ms', 'x')
    return text.replace('d', 'p')
Example #2
0
def _normalize_timestr(timestr):
    """Collapse the unit words of a time string into short codes.

    Numbers (int, long, float) are handed back unchanged.  Anything else
    is passed through ``normalize`` and its unit names are shortened, in
    this order: milliseconds -> 'ms', seconds -> 's', minutes -> 'm',
    hours -> 'h', days -> 'd'.  Finally 'ms' becomes 'x' and 'd'
    becomes 'p'.
    """
    if isinstance(timestr, (int, long, float)):
        return timestr
    # Order matters: groups are applied top to bottom, and within a group
    # the longer spellings are replaced before their own prefixes.
    unit_aliases = (
        ('ms', ('milliseconds', 'millisecond', 'millis')),
        ('s', ('seconds', 'second', 'secs', 'sec')),
        ('m', ('minutes', 'minute', 'mins', 'min')),
        ('h', ('hours', 'hour')),
        ('d', ('days', 'day')),
    )
    text = normalize(timestr)
    for short, spellings in unit_aliases:
        for spelling in spellings:
            text = text.replace(spelling, short)
    # 'ms' -> 'x' eases later processing; 'd' -> 'p' because float('1d')
    # returns 1.0 in Jython (bug submitted).
    text = text.replace('ms', 'x')
    return text.replace('d', 'p')
Example #3
0
def build_vocab():
    """Collect distinct normalized words and pickle them to ``vocab.dat``.

    Entries are read from ``get_reader("train-sample.csv")``.  Columns 6
    and 7 of each entry are joined with a space, passed through
    ``normalize`` and split on whitespace; every word not seen before is
    appended to the vocabulary in first-seen order.  Reading stops once
    the vocabulary holds more than 5000 words, and the resulting list is
    pickled to ``vocab.dat``.
    """
    vocab = []
    # Mirrors ``vocab`` for O(1) membership tests; the list keeps the order.
    seen = set()
    itr = get_reader("train-sample.csv")
    for entry in itr:
        title_body = " ".join(entry[6:8])
        ntxt = normalize(title_body)
        if not ntxt:
            continue

        for nword in ntxt.split():
            if nword not in seen:
                seen.add(nword)
                vocab.append(nword)

        # NOTE(review): the size is only checked after a whole entry has
        # been merged, so the vocabulary can end up larger than 5000 words.
        if len(vocab) > 5000:
            break

    # Pickle data is binary; "wb" avoids newline translation on platforms
    # that distinguish text and binary files.
    with open("vocab.dat", "wb") as vocab_writer:
        pickle.dump(vocab, vocab_writer)
Example #4
0
def get_features():
    """Print the feature vector of the first usable row, then exit.

    Rows are read from ``get_reader("train-sample.csv")``.  For each row
    the text of columns 6 and 7 is joined and passed through
    ``normalize``; rows whose normalized text is empty are skipped.  The
    vector holds 5000 bag-of-words flags (1 when the vocabulary word at
    that position occurs in the text) followed by the owner's account age
    in days at post creation, the reputation (column 4) and the undeleted
    answer count (column 5).  The vector is printed and ``exit()`` is
    called, so only the first usable row is processed.
    """
    feature_size = 5000
    rows = get_reader("train-sample.csv")
    for row in rows:
        # remove PostId
        # remove OwnerUserId
        # create a feature called owner_age_at_post_creation = (PostCreationDate - OwnerCreationDate)
        # keep ReputationAtPostCreation
        # keep OwnerUndeletedAnswerCountAtPostTime
        # remove Tags
        # remove PostClosedDate

        post_creation_date = datetime.strptime(row[1], "%m/%d/%Y %H:%M:%S")
        owner_creation_date = datetime.strptime(row[3], "%m/%d/%Y %H:%M:%S")
        owner_age_now = (post_creation_date - owner_creation_date).days

        latest_reputation = row[4]
        undeleted_answer = row[5]
        title_body = " ".join(row[6:8])

        ntxt = normalize(title_body)
        if not ntxt:
            continue

        vocab = read_pickle_file()
        txt_feat = [0] * feature_size

        for nword in ntxt.split():
            try:
                position = vocab.index(nword)
            except ValueError:
                # Word is not part of the vocabulary.
                continue
            # The vocabulary may hold more words than the vector has
            # slots; skip those instead of raising IndexError.
            if position < feature_size:
                txt_feat[position] = 1

        txt_feat.extend([owner_age_now, int(latest_reputation),
                         int(undeleted_answer)])
        print(txt_feat)

        # Stops the program after the first row that produced features.
        exit()
Example #5
0
def matches(string, pattern, ignore=[], caseless=True, spaceless=True):
    """Tell whether the normalized ``string`` matches the normalized ``pattern``.

    Both arguments go through ``normalize`` with the same ``ignore``,
    ``caseless`` and ``spaceless`` options.  The normalized pattern is
    turned into a regular expression by ``_get_match_regexp`` and matched
    against the normalized string with ``re.DOTALL``.  Returns a bool.
    """
    # The default ``ignore`` list is only forwarded, never modified here.
    options = (ignore, caseless, spaceless)
    subject = normalize(string, *options)
    wildcard = normalize(pattern, *options)
    compiled_source = _get_match_regexp(wildcard)
    if re.match(compiled_source, subject, re.DOTALL) is None:
        return False
    return True
Example #6
0
def eq_any(str_, str_list, ignore=[], caseless=True, spaceless=True):
    """Tell whether ``str_`` equals any item of ``str_list`` after normalizing.

    ``str_`` and each candidate are passed through ``normalize`` with the
    same ``ignore``, ``caseless`` and ``spaceless`` options.  Candidates
    are checked in order and the search stops at the first equal one.
    """
    # The default ``ignore`` list is only forwarded, never modified here.
    wanted = normalize(str_, ignore, caseless, spaceless)
    return any(
        wanted == normalize(candidate, ignore, caseless, spaceless)
        for candidate in str_list
    )
Example #7
0
def eq(str1, str2, ignore=[], caseless=True, spaceless=True):
    """Compare ``str1`` and ``str2`` after normalizing both.

    Each string is passed through ``normalize`` with the same ``ignore``,
    ``caseless`` and ``spaceless`` options; the result of comparing the
    two normalized values with ``==`` is returned.
    """
    # The default ``ignore`` list is only forwarded, never modified here.
    left, right = [
        normalize(text, ignore, caseless, spaceless) for text in (str1, str2)
    ]
    return left == right
Example #8
0
def eq(str1, str2, ignore=[], caseless=True, spaceless=True):
    """Compare ``str1`` and ``str2`` after normalizing both.

    Each string is passed through ``normalize`` with the same ``ignore``,
    ``caseless`` and ``spaceless`` options; the result of comparing the
    two normalized values with ``==`` is returned.
    """
    # The default ``ignore`` list is only forwarded, never modified here.
    left, right = [
        normalize(text, ignore, caseless, spaceless) for text in (str1, str2)
    ]
    return left == right
Example #9
0
def matches(string, pattern, ignore=[], caseless=True, spaceless=True):
    """Tell whether the normalized ``string`` matches the normalized ``pattern``.

    Both arguments go through ``normalize`` with the same ``ignore``,
    ``caseless`` and ``spaceless`` options.  The normalized pattern is
    turned into a regular expression by ``_get_match_regexp`` and matched
    against the normalized string with ``re.DOTALL``.  Returns a bool.
    """
    # The default ``ignore`` list is only forwarded, never modified here.
    options = (ignore, caseless, spaceless)
    subject = normalize(string, *options)
    wildcard = normalize(pattern, *options)
    compiled_source = _get_match_regexp(wildcard)
    if re.match(compiled_source, subject, re.DOTALL) is None:
        return False
    return True
Example #10
0
def eq_any(str_, str_list, ignore=[], caseless=True, spaceless=True):
    """Tell whether ``str_`` equals any item of ``str_list`` after normalizing.

    ``str_`` and each candidate are passed through ``normalize`` with the
    same ``ignore``, ``caseless`` and ``spaceless`` options.  Candidates
    are checked in order and the search stops at the first equal one.
    """
    # The default ``ignore`` list is only forwarded, never modified here.
    wanted = normalize(str_, ignore, caseless, spaceless)
    return any(
        wanted == normalize(candidate, ignore, caseless, spaceless)
        for candidate in str_list
    )