Exemple #1
0
def main(argv=None):
    """Command-line entry point: extract the top-k weighted words from comments.

    Loads a table of comments (from a database or a file), segments the
    ``comment`` column with ``utils.word_cut``, weights the result with
    ``utils.tfidf`` and finally calls ``utils.get_topK`` restricted to the
    requested word categories.

    Args:
        argv: Optional list of command-line argument strings. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what the
            original zero-argument ``main()`` did.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including a
            missing ``--name``.
    """
    parser = argparse.ArgumentParser(description="take text feature")
    parser.add_argument("-t", "--type", type=str, choices=("db", "file"), default="file", help="db/file")
    # NOTE(review): --source has no default and is passed straight to the
    # loader; presumably it should be mandatory as well -- confirm.
    parser.add_argument("-s", "--source", type=str, help="file path/sql script")
    # --name is concatenated into the "tfidf_"/"top_k_" names below; without
    # it the concatenation raised TypeError, so fail early with a usage error.
    parser.add_argument("-n", "--name", type=str, required=True, help="output file name")
    parser.add_argument("-k", "--topk", type=int, default=500, help="top k words")
    parser.add_argument("-w", "--word_category", default="v,vd,vn,vf,a,ad,an,ag,al", type=str, help="word category")
    args = parser.parse_args(argv)

    name = args.name
    word_category = args.word_category.split(",")
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(word_category)

    # ``choices`` restricts --type to "db" or "file", so two branches suffice.
    if args.type == "db":
        comments_df = preprocess.get_data_from_db(args.source)
    else:
        comments_df = preprocess.read_comment_from_file(args.source)

    comments_list = list(comments_df["comment"].values)
    cutted, word_category_list = utils.word_cut(comments_list)
    word_weight_flag = utils.tfidf(cutted, word_category_list, "tfidf_" + name)
    # The return value was never used; the call is kept as-is (presumably for
    # its output side effect -- confirm against utils.get_topK).
    utils.get_topK(word_weight_flag, "top_k_" + name, k=args.topk, category_list=word_category)
Exemple #2
0
def main(argv=None):
    """Run the keyword-extraction pipeline from the command line.

    Steps, in order: load the comments table from the selected source, cut
    the ``comment`` column into words via ``utils.word_cut``, compute weights
    via ``utils.tfidf``, then call ``utils.get_topK`` limited to the word
    categories given with ``--word_category``.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (default)
            makes argparse fall back to ``sys.argv[1:]``, matching the
            behaviour of the original parameterless ``main()``.

    Raises:
        SystemExit: Raised by argparse when the arguments are invalid, e.g.
            when the mandatory ``--name`` option is missing.
    """
    parser = argparse.ArgumentParser(description='take text feature')
    parser.add_argument('-t',
                        '--type',
                        type=str,
                        choices=('db', 'file'),
                        default='file',
                        help='db/file')
    # NOTE(review): --source has no default and is handed directly to the
    # loader; it probably ought to be required too -- confirm.
    parser.add_argument('-s',
                        '--source',
                        type=str,
                        help='file path/sql script')
    # Required: the value is concatenated into 'tfidf_' / 'top_k_' names
    # below, which raised TypeError when the option was left out.
    parser.add_argument('-n',
                        '--name',
                        type=str,
                        required=True,
                        help='output file name')
    parser.add_argument('-k',
                        '--topk',
                        type=int,
                        default=500,
                        help='top k words')
    parser.add_argument('-w',
                        '--word_category',
                        default='v,vd,vn,vf,a,ad,an,ag,al',
                        type=str,
                        help='word category')
    args = parser.parse_args(argv)

    name = args.name
    word_category = args.word_category.split(',')
    # Function-call form of print works on both Python 2 and Python 3.
    print(word_category)

    # argparse ``choices`` guarantees the type is either 'db' or 'file'.
    if args.type == 'db':
        comments_df = preprocess.get_data_from_db(args.source)
    else:
        comments_df = preprocess.read_comment_from_file(args.source)

    comments_list = list(comments_df['comment'].values)
    cutted, word_category_list = utils.word_cut(comments_list)
    word_weight_flag = utils.tfidf(cutted, word_category_list, 'tfidf_' + name)
    # Result is intentionally not bound: the original never used it.
    utils.get_topK(word_weight_flag,
                   'top_k_' + name,
                   k=args.topk,
                   category_list=word_category)
Exemple #3
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

__author__ = 'Administrator'
import sys
import numpy as np
sys.path.append('./lib')

import utils
import preprocess
import match_comments as mc
import string

# Apply mc.tag_comments to every comment, save the tagged comments, then
# collect the distinct tags into a separate file.
comments_df = preprocess.read_comment_from_file('data/order_review')
comments = comments_df['comment'].iloc[:]
# NOTE(review): the single quotes are part of the literal file name
# (top_k_'test'); confirm this matches the name the extraction step wrote.
keys = utils.read_in_keys("top_k_'test'")
result = comments.apply(mc.tag_comments, args=(keys, ))
result.to_csv('comment_with_tag', sep='\t', encoding='utf-8')

bag_of_tags = set()
for line in result:
    # Everything after the last '>>' is treated as a tab-separated tag list.
    tmp = line.split('>>')[-1]
    # str.strip replaces string.strip, which was removed in Python 3.
    tags = [tag.strip() for tag in tmp.split('\t')]
    # In-place update avoids rebuilding the whole set on every iteration.
    bag_of_tags.update(tags)

# The context manager closes the file even if a write fails.
with open('tags', 'w') as ff:
    for t in bag_of_tags:
        ff.write(t + '\n')
Exemple #4
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

__author__ = 'Administrator'
import sys
import numpy as np
sys.path.append('./lib')

import utils
import preprocess
import match_comments as mc
import string


# Run mc.tag_comments over each comment, write the tagged comments to
# 'comment_with_tag', and dump the set of distinct tags to 'tags'.
comments_df = preprocess.read_comment_from_file('data/order_review')
comments = comments_df['comment'].iloc[:]
# NOTE(review): the key file name literally contains single quotes
# (top_k_'test'); verify that such a file is what the previous step produces.
keys = utils.read_in_keys("top_k_'test'")
result = comments.apply(mc.tag_comments, args=(keys,))
result.to_csv('comment_with_tag', sep='\t', encoding='utf-8')

bag_of_tags = set()
for line in result:
    # The text after the last '>>' holds the tags, separated by tabs.
    tmp = line.split('>>')[-1]
    # Use the str method: the string.strip function is gone in Python 3.
    tags = [tag.strip() for tag in tmp.split('\t')]
    # Grow the set in place instead of building a new union each time.
    bag_of_tags.update(tags)

# ``with`` guarantees the handle is closed even when a write raises.
with open('tags', 'w') as ff:
    for t in bag_of_tags:
        ff.write(t + '\n')