Example #1
def splitWord(line):
    vran = line[0]
    title = line[1]
    words = ttp.clean_title(title).split(" ")
    # drop empty tokens, then prefix each word with its vran key
    return [vran + "-" + word for word in words if word]
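A minimal usage sketch, not from the page: the (vran, title) pairs these helpers receive look like key/value records from a distributed map pass. Assuming Example #1's splitWord is defined in the same module, with a hypothetical stand-in for the TitlePreprocessing module:

# hypothetical stub for TitlePreprocessing, imported elsewhere as ttp
class ttp:
    @staticmethod
    def clean_title(title):
        return title.lower()

print splitWord(("shop123", "Red  Wool Scarf"))
# -> ['shop123-red', 'shop123-wool', 'shop123-scarf']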
Example #2
def parseTitle(line):
    vran = line[0]
    title = line[1]
    words = ttp.clean_title(title).split(" ")
    # keep only non-empty katakana tokens (guard against empty strings)
    words = [word for word in words if word and is_katakana(word)]
    # prefix each token with the vran key
    for i in range(len(words)):
        words[i] = vran + "-" + words[i]
    return words
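None of the examples define is_katakana; a minimal sketch of one plausible implementation, accepting a token whose characters all fall in the Unicode Katakana block (U+30A0 through U+30FF):

def is_katakana(word):
    # hypothetical helper, not the page's actual implementation;
    # assumes word is a unicode string
    return all(u'\u30a0' <= ch <= u'\u30ff' for ch in word)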
Example #3
def parseTitle(line):
    vran = line[0]  # row key (unused in this variant)
    title = unicode(line[1], 'utf-8')
    words = ttp.clean_title(title).split(" ")
    words = [word for word in words if word and is_katakana(word)]
    # pair each token with its full title as "word|title"
    return [word + "|" + title for word in words]
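This variant keys each katakana token to its full title as "word|title" rather than prefixing a vran, presumably so a downstream step can recover the originating title from the emitted value alone.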
Example #5
def parseTitle(line):
    vran = line[0]
    title = line[1]
    words = ttp.clean_title(title).split(" ")
    # check emptiness before is_katakana so empty tokens are skipped cheaply
    words = [word for word in words if word and is_katakana(word)]
    for i in range(len(words)):
        words[i] = vran + "-" + words[i]
    return words
Example #6
def parseTitle(line):
    ans = []
    vran = line[0]  # row key (unused in this variant)
    title = unicode(line[1], 'utf-8')
    words = ttp.clean_title(title).split(" ")
    words = [word for word in words if word and is_katakana(word)]
    # emit (token, title) pairs
    for i in range(len(words)):
        ans.append((words[i], title))
    return ans
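Compared with Example #3, returning (word, title) tuples instead of "word|title" strings sidesteps delimiter collisions when a title itself contains '|'.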
Example #8
def preprocess(title, srm=NLPSpamRemover()):
    tokenized_title = ttp.clean_title(title)

    tokens = []
    for t in tokenized_title.split(' '):
        # strip ASCII and ideographic (U+3000) spaces inside the token
        adj = t.replace(' ', '').replace(u'\u3000', '')
        # drop short tokens that are not purely alphanumeric
        if len(adj) < 4 and not adj.isalnum():
            continue
        if u'\u5186' in t:  # skip price fragments containing '円' (yen)
            continue
        tokens.append(adj)

    clean_tokens = srm.remove_spam2(tokens)

    return tokenized_title, clean_tokens
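Note that the srm=NLPSpamRemover() default is evaluated once, at function-definition time, so every call without an explicit remover shares a single instance. Here that looks intentional, since it builds the spam model once instead of per call, but it is the classic Python default-argument pitfall to keep in mind when the default is mutable.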
Example #10
def main(spam_remover, logger):
    import sys
    import time
    import TitlePreprocessing as ttp

    logger.debug('#keys:' + str(len(spam_remover.spam_map)))
    logger.debug('key1:' + str(type(spam_remover.spam_map.keys()[0])) + ' ' +
                 str(spam_remover.spam_map.keys()[0]))
    logger.debug('val1:' +
                 str(spam_remover.spam_map[spam_remover.spam_map.keys()[0]]))
    # TODO move these outputs into each spam remover's class and vary them by
    # datatype: the nlp_spam map is hash->dict and entropy_spam is hash->set
    try:
        for k, vals in spam_remover.spam_map[spam_remover.spam_map.keys()
                                             [0]].iteritems():
            logger.debug('elem k: ' + str(type(k)) + ' ' + str(k))
            for v in vals:
                logger.debug('elem v:' + str(type(v)) + ' ' + str(v))
    except AttributeError:
        # entropy_spam values are sets, which lack iteritems(); skip the dump
        pass
    stime = time.time()

    # test1: tokenize a sample title and log it only if spam removal changed it
    logger.debug('')
    logger.debug('test1 --------------------------------')
    title = '【エントリーでポイント10倍】セクシーワンピ/ワンピース/レディースファッション/海外人気モデル【10500円以上で送料無料】'
    logger.debug('title : ' + str(type(title)) + ' ' + title)
    tkns = ttp.clean_title(title).split(' ')
    ptitle1 = ' '.join(tkns)

    ret = spam_remover.remove_spam(tkns)
    ptitle2 = ' '.join(ret)
    if ptitle1 != ptitle2:
        logger.debug('in :' + ptitle1.encode('utf8'))
        logger.debug('out:' + ptitle2.encode('utf8'))

    # test2: benchmark a whole genre when a genre ID is given on the command line
    if len(sys.argv) > 1:
        logger.debug('')
        logger.debug('test2 --------------------------------')
        # removeSpam_test('507745')
        removeSpam_benchmark(spam_remover, sys.argv[1], logger)
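As a usage note: with no command-line arguments only test1 runs; passing a genre ID as argv[1] additionally triggers the removeSpam_benchmark pass shown in Example #12.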
Example #12
def removeSpam_benchmark(spam_remover, genreID, logger):
    import sys
    import time
    import traceback
    import CassConn.CassConn as CC
    import TitlePreprocessing as ttp
    stime = time.time()

    cc = CC.CassConn(env='INS')

    batchSize = 500
    rowKey = "G_" + str(genreID)
    items = [x[0] for x in cc.cfProductMaster.xget(rowKey)]

    logger.debug(' '.join(
        ['genre ', genreID, ' received ',
         str(len(items)), ' from G_ index']))
    if len(items) > 60000:
        print 'skip big genre'
        return

    # fetch product rows in batches and run spam removal on each title
    for indx in range(0, len(items), batchSize):
        pdata = cc.cfProductMaster.multiget(items[indx:indx + batchSize],
                                            columns=['V1'])
        for prodKey, pvals in pdata.iteritems():
            try:
                parts = pvals['V1'].split('\t')
                title = parts[8]

                # tokenize and remove spam
                tkns = ttp.clean_title(title).split(' ')

                ptitle1 = ' '.join(tkns)
                ret = spam_remover.remove_spam(tkns)
                ptitle2 = ' '.join(ret)

                #logger.debug(' '.join([prodKey,'in :',ptitle1.encode('utf8')]))
                #logger.debug(' '.join([prodKey,'out:',ptitle2.encode('utf8')]))
                #logger.debug(' -------------------------- ')
            except Exception:
                # abort the benchmark on the first malformed row
                print traceback.format_exc()
                sys.exit()
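Two throttles are visible here: genres holding more than 60000 items are skipped outright, and the rest are fetched with multiget in batches of 500 so that no single Cassandra request grows unbounded.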
Example #14
def containsAlphabets(line):
    title = ttp.clean_title(line)
    # return the cleaned title if it contains any ASCII letters, else 0
    if re.search('[a-zA-Z]', title):
        return title
    else:
        return 0
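Returning either the cleaned title or 0 presumably lets callers use this function both as a filter predicate (0 is falsy) and as a mapper yielding the cleaned title in a single pass.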
Example #15
def splitWord(title):
    words = ttp.clean_title(title).split(' ')
    # drop empty tokens and stray bracket characters
    words = [word for word in words if word not in ('', '[', ']')]
    return words
Example #16
def splitWord(title):
    words = ttp.clean_title(title).split(' ')
    # filter(None, ...) drops empty strings (and returns a list under Python 2)
    words = filter(None, words)
    return words
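Examples #15 and #16 differ only in their filters: filter(None, words) drops just empty strings, while Example #15's comprehension also removes stray '[' and ']' tokens, presumably left behind by the tokenizer.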