Example no. 1
def do_predict(args):

    pretrained_embeddings, token2id = util.load_word_embedding(input_file=args.vectors, cache='cache')
    stopwords = util.load_stopwords()
    stopwords = None  # stopword filtering is disabled here by overriding with None
    train_data = util.Data(args.data_train, args.ltp_data, stopwords=stopwords)
    test_data = util.Data(args.data_test, args.ltp_data, max_length=train_data.max_length, stopwords=stopwords)
    config = Config(args)
    # Configure the parameters. How should these be set for the test set?
    _, config.max_length = train_data.get_metadata()
    config.n_classes = len(train_data.LABELS)
    config.n_word_embed_size = len(pretrained_embeddings[0])
    config.batch_size = len(test_data.data)
    

    with tf.Graph().as_default():
        logger.info("Building model...",)
        start = time.time()
        model = Classifier(pretrained_embeddings, token2id, config)
        logger.info("took %.2f seconds", time.time() - start)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        gpu_options = tf.GPUOptions(allow_growth=True)
        with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
            
            session.run(init)
            saver.restore(session, model.config.output_model)
            labels, prediction = model.output(session, test_data, None)
            print(labels)
            print(prediction)
            
            test_data.update_labels(prediction).save_result()
Example no. 2
def do_train(args):

    pretrained_embeddings, token2id = util.load_word_embedding(input_file=args.vectors, cache='cache')
    stopwords = util.load_stopwords()
    stopwords = None  # stopword filtering is disabled here by overriding with None
    train_data = util.Data(args.data_train, args.ltp_data, stopwords=stopwords)
    dev_data = util.Data(args.data_dev, args.ltp_data, max_length=train_data.max_length, stopwords=stopwords)
    config = Config(args)
    print(train_data.max_length)
    # Configure the parameters. How should these be set for the test set?
    _, config.max_length = train_data.get_metadata()
    config.n_classes = len(train_data.LABELS)
    config.n_word_embed_size = len(pretrained_embeddings[0])


    with tf.Graph().as_default():
        logger.info("Building model...",)
        start = time.time()
        model = Classifier(pretrained_embeddings, token2id, config)
        logger.info("took %.2f seconds", time.time() - start)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        gpu_options = tf.GPUOptions(allow_growth=True)
        with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
            
            session.run(init)
            score = model.fit(session, saver, train_data, dev_data) 
            print("\n")
            logger.info("training finished, took %.2f seconds with P: %.2f", time.time() - start, score)
Example no. 3
def segment_train_data(train_data_path, saved_file):
    '''
    Segment the two text fields of each record in train_data_path, drop
    duplicate news ids and stopwords, and write the result to data_dir + saved_file.
    '''

    #filter_stopwords = lambda x: text_segment(x)
    stopwords = load_stopwords()

    def filter_stopwords(words):
        #import pdb;pdb.set_trace()
        return [r for r in words if r not in stopwords]

    print 'run segment_train_data...'
    titles = []
    f = open(train_data_path, 'r')
    line = f.readline()
    start_time = time.time()
    res = []
    cnt = 0
    newsids = set()
    round_start = time.time()
    while line:
        #line = unicode2str(line)
        parts = line.strip().split('\t')
        if parts[1].strip() in newsids:
            line = f.readline()
            continue

        newsids.add(parts[1].strip())
        cnt += 1
        parts[3] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[3], is_ret_utf8=True))))
        parts[4] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[4], is_ret_utf8=True))))
        res.append('\t\t'.join(parts))

        if cnt % 1000 == 0:
            round_cost = (time.time() - round_start)
            round_start = time.time()
            print 'segmenting %s, cost %.3fs, aver=%.3fs' % (
                cnt, round_cost, round_cost / 1000.0)

        line = f.readline()

    end_time = time.time()
    total_cost = (end_time - start_time) / 60.0
    aver_cost = total_cost / float(cnt)

    print 'segmenting all %s records, total cost=%.3fmin, average=%.3fmin' % (
        cnt, total_cost, aver_cost)

    fw = open(data_dir + saved_file, 'w+')
    fw.write('\n'.join(res))
    fw.close()
    print 'res is saved in %s' % (saved_file)
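A call might look like the sketch below; both file names are placeholders, and the loop above assumes tab-separated records with the news id in column 1 and the two text fields in columns 3 and 4.

# Hypothetical invocation; file names are placeholders, and data_dir must
# already be defined since it is used inside the function.
segment_train_data('raw/train_data.txt', 'train_data.segmented.txt')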
Example no. 4
def __init__(self, template_ids, **kwargs):
    self.template_ids = template_ids
    self.vocab = None
    self.stopwords = set([])
    if "stopwords" in kwargs:
        self.stopwords = util.load_stopwords(kwargs["stopwords"])
        print("loaded %d stopwords" % len(self.stopwords))
    self.word2vec_model = None
    if "word2vec_model" in kwargs:
        self.word2vec_model = embedding.load_embeddings(kwargs["word2vec_model"])
        print("loaded word2vec model from %s" % kwargs["word2vec_model"])
    self.embedding_dim = None
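A possible instantiation of the constructor above is sketched below; the class name TemplateFeatureExtractor and both file paths are placeholders, only the template_ids, stopwords and word2vec_model keywords come from the code.

# Hypothetical usage; the class name and paths are placeholders.
extractor = TemplateFeatureExtractor(
    template_ids=[0, 1, 2],                  # assumed template identifiers
    stopwords='resources/stopwords.txt',     # forwarded to util.load_stopwords
    word2vec_model='resources/vectors.bin',  # forwarded to embedding.load_embeddings
)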
Example no. 7
# ('Garland, TX', ('US', 'Today i used Spark'))
tweets_city = us_tweets.map(lambda row: (row[0], row[1][2]))
# ('Garland, TX', 'Today i used Spark')

# Left outer join cities with tweets, so we only have the relevant tweets left
top_tweets = top_cities.leftOuterJoin(tweets_city)
# (city, (count, text))

# Map each word longer than 2 characters to key = lowercased word,
# value = (city, city-tweet-count)
# We keep the city-tweet-count in order to sort the results later on
words = top_tweets.flatMap(lambda row: ((w.lower(), (row[0], row[1][0])) for w in row[1][1].split(' ') if len(w) > 2))
# ('#repost', ('Houston, TX', 21499))

# Load stopwords
stopwords = load_stopwords()
# ('a', None)

# Subtract the sets
words_filtered = words.subtractByKey(stopwords)
# ('clerk', ('Houston, TX', 213))

# Map to ("city-word", "city-count", "word") as key, 1 as value
words_intermediate = words_filtered.map(lambda row: ((row[1][0], row[1][1], row[0]),1))
# (('Houston, TX', 21499, 'even'), 1)

# Sum the 1s for each (city, city-count, word) key
words_counted = words_intermediate.reduceByKey(lambda a,b: a+b)
# (('Manhattan, NY', 495, 'new'), 158)

# Sort words by frequency
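A sketch of how that sort could be expressed in PySpark, assuming the ((city, city-count, word), count) layout shown above; words_sorted is a name introduced here for illustration, not taken from the source.

# Hypothetical continuation: order the pairs by word frequency, descending.
words_sorted = words_counted.sortBy(lambda row: row[1], ascending=False)
# e.g. (('Manhattan, NY', 495, 'new'), 158) sorts ahead of lower counts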
Example no. 8
    parser.add_argument('--right-pad-symbol',
                        help='Right pad symbol at the end of sentences.',
                        default='</s>')
    parser.add_argument('--no-padding', dest='padding', action='store_false')
    args = parser.parse_args()

    if not args.padding:
        args.left_pad_symbol = None
        args.right_pad_symbol = None
    if args.skipgram and args.n != 2:
        print('Skipgrams only allowed for n = 2.', file=sys.stderr)
        sys.exit(1)

    print(args)
    file_paths = scan_all_files(args.root_dir)
    stopword_list = load_stopwords(args.stopwords)
    iq, oq = Queue(), Queue()
    processes = []
    total_files_num = len(file_paths)
    total_words_num = 0
    total_counter = Counter()
    num_processes = cpu_count()

    def finish():
        for p in processes:
            p.terminate()
        iq.close()
        oq.close()
        end = timer()
        print(f'Computation took {end - start} seconds in total.')
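The snippet above prepares one input queue, one output queue and one process per CPU. The sketch below shows that fan-out pattern in a self-contained form; the worker body, the placeholder paths and the final print are illustrations, not taken from the source.

from collections import Counter
from multiprocessing import Process, Queue, cpu_count

def worker(iq, oq):
    # Hypothetical worker: pull file paths until a None sentinel arrives,
    # push back one Counter of whitespace-separated tokens per file.
    for path in iter(iq.get, None):
        with open(path, encoding='utf-8') as fh:
            oq.put(Counter(fh.read().split()))

if __name__ == '__main__':
    iq, oq = Queue(), Queue()
    file_paths = ['corpus/a.txt', 'corpus/b.txt']    # placeholder paths
    processes = [Process(target=worker, args=(iq, oq)) for _ in range(cpu_count())]
    for p in processes:
        p.start()
    for path in file_paths:
        iq.put(path)
    for _ in processes:
        iq.put(None)                                 # one sentinel per worker
    total_counter = Counter()
    for _ in file_paths:
        total_counter.update(oq.get())
    for p in processes:
        p.join()
    print(total_counter.most_common(10))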
Example no. 9
import re
import string

import jieba
import joblib
from zhon import hanzi
from typing import List, Set

# Own customized modules
from global_variables import *
from util import load_stopwords

jieba.load_userdict(USERDICT_FILEPATH)

# regular expression
PUNC_REGEX = r"^[{} \s]+$".format(string.punctuation + hanzi.punctuation)
NUM_REGEX = r"^[0-9]*\.?[0-9]+$"
UNIT_REGEX = r"^([0-9]*)(mm|cm|m)?$"
ALPHANUM_REGEX = r"^[a-zA-Z0-9]+$"

stopwords = load_stopwords(STOPWORDS_FILEPATH)
vectorizer = joblib.load(VECTORIZER_FILEPATH)
model = joblib.load(MODEL_FILEPATH)


def segment(text: str, stopwords: Set, lowercase: bool = True) -> str:
    if lowercase:
        text = text.lower()
    words = []
    jieba_res = jieba.cut(text)
    for w in jieba_res:
        if len(w) <= 1:
            continue
        if w in stopwords:
            continue
        if re.match(PUNC_REGEX, w):