def do_predict(args):
    pretrained_embeddings, token2id = util.load_word_embedding(input_file=args.vectors, cache='cache')
    stopwords = util.load_stopwords()
    stopwords = None  # stopword filtering is disabled: the loaded set is immediately overridden
    train_data = util.Data(args.data_train, args.ltp_data, stopwords=stopwords)
    test_data = util.Data(args.data_test, args.ltp_data, max_length=train_data.max_length, stopwords=stopwords)

    config = Config(args)
    # Configure parameters. How should they be set for the test set?
    _, config.max_length = train_data.get_metadata()
    config.n_classes = len(train_data.LABELS)
    config.n_word_embed_size = len(pretrained_embeddings[0])
    config.batch_size = len(test_data.data)

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = Classifier(pretrained_embeddings, token2id, config)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        gpu_options = tf.GPUOptions(allow_growth=True)

        with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
            session.run(init)
            saver.restore(session, model.config.output_model)
            labels, prediction = model.output(session, test_data, None)
            print(labels)
            print(prediction)
            test_data.update_labels(prediction).save_result()
def do_train(args):
    pretrained_embeddings, token2id = util.load_word_embedding(input_file=args.vectors, cache='cache')
    stopwords = util.load_stopwords()
    stopwords = None  # stopword filtering is disabled: the loaded set is immediately overridden
    train_data = util.Data(args.data_train, args.ltp_data, stopwords=stopwords)
    dev_data = util.Data(args.data_dev, args.ltp_data, max_length=train_data.max_length, stopwords=stopwords)

    config = Config(args)
    print(train_data.max_length)
    # Configure parameters. How should they be set for the test set?
    _, config.max_length = train_data.get_metadata()
    config.n_classes = len(train_data.LABELS)
    config.n_word_embed_size = len(pretrained_embeddings[0])

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = Classifier(pretrained_embeddings, token2id, config)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        gpu_options = tf.GPUOptions(allow_growth=True)

        with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
            session.run(init)
            score = model.fit(session, saver, train_data, dev_data)
            print("\n")
            logger.info("training finished, took %.2f seconds with P: %.2f", time.time() - start, score)
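# For orientation, a minimal sketch of how do_train / do_predict might be wired to a command
# line. The flag names are inferred from the attributes accessed above (args.vectors,
# args.data_train, args.data_dev, args.data_test, args.ltp_data); the project's real entry
# point likely defines more options (e.g. whatever Config(args) reads), so this is an
# assumption for illustration, not the actual CLI.
import argparse

parser = argparse.ArgumentParser()
sub = parser.add_subparsers(dest='command')

train = sub.add_parser('train')
predict = sub.add_parser('predict')
for p in (train, predict):
    p.add_argument('--vectors', required=True, help='pretrained word-vector file')
    p.add_argument('--ltp_data', required=True, help='LTP model directory')
    p.add_argument('--data_train', required=True, help='training data file')
train.add_argument('--data_dev', required=True, help='dev data file')
predict.add_argument('--data_test', required=True, help='test data file')

args = parser.parse_args()
if args.command == 'train':
    do_train(args)
elif args.command == 'predict':
    do_predict(args)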
def segment_train_data(train_data_path, saved_file):
    '''Segment and stopword-filter the title/content fields of the raw training data,
    deduplicating records by news id, and save the result under data_dir/saved_file.'''
    stopwords = load_stopwords()

    def filter_stopwords(words):
        return [r for r in words if r not in stopwords]

    print 'run segment_train_data...'
    f = open(train_data_path, 'r')
    line = f.readline()
    start_time = time.time()
    res = []
    cnt = 0
    newsids = set()
    round_start = time.time()
    while line:
        parts = line.strip().split('\t')
        if parts[1].strip() in newsids:  # skip records whose news id was already seen
            line = f.readline()
            continue
        newsids.add(parts[1].strip())
        cnt += 1
        parts[3] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[3], is_ret_utf8=True))))
        parts[4] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[4], is_ret_utf8=True))))
        res.append('\t\t'.join(parts))
        if cnt % 1000 == 0:
            round_cost = time.time() - round_start
            round_start = time.time()
            print 'segmenting %s, cost %.3fs, aver=%.3fs' % (
                cnt, round_cost, round_cost / 1000.0)
        line = f.readline()
    end_time = time.time()
    total_cost = (end_time - start_time) / 60.0
    aver_cost = total_cost / float(cnt)
    print 'segmenting all %s records, total cost=%.3fmin, average=%.3fmin' % (
        cnt, total_cost, aver_cost)
    fw = open(data_dir + saved_file, 'w+')
    fw.write('\n'.join(res))
    fw.close()
    print 'res is saved in %s' % (saved_file)
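# A minimal, self-contained sketch of the dedup-by-news-id + stopword-filtering step used in
# segment_train_data above, with plain whitespace splitting standing in for text_segment()
# and in-memory rows standing in for the input file. Illustration only; not the project's code.
def dedup_and_filter(rows, stopwords):
    seen_ids = set()
    out = []
    for line in rows:
        parts = line.strip().split('\t')
        news_id = parts[1].strip()
        if news_id in seen_ids:  # drop records whose id was already processed
            continue
        seen_ids.add(news_id)
        parts[3] = ' '.join(w for w in parts[3].split() if w not in stopwords)
        parts[4] = ' '.join(w for w in parts[4].split() if w not in stopwords)
        out.append('\t\t'.join(parts))
    return out

demo_rows = [
    'cat\t1001\tsrc\tthe quick brown fox\tjumps over the lazy dog',
    'cat\t1001\tsrc\tduplicate id so this row is skipped\tduplicate id so this row is skipped',
]
print(dedup_and_filter(demo_rows, {'the', 'over'}))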
def __init__(self, template_ids, **kwargs):
    self.template_ids = template_ids
    self.vocab = None

    self.stopwords = set([])
    if "stopwords" in kwargs:
        self.stopwords = util.load_stopwords(kwargs["stopwords"])
        print("loaded %d stopwords" % len(self.stopwords))

    self.word2vec_model = None
    if "word2vec_model" in kwargs:
        self.word2vec_model = embedding.load_embeddings(kwargs["word2vec_model"])
        print("loaded word2vec model from %s" % kwargs["word2vec_model"])

    self.embedding_dim = None
# ('Garland, TX', ('US', 'Today i used Spark'))
tweets_city = us_tweets.map(lambda row: (row[0], row[1][2]))
# ('Garland, TX', 'Today i used Spark')

# Left outer join cities with tweets, so we only have the relevant tweets left
top_tweets = top_cities.leftOuterJoin(tweets_city)
# (city, (count, text))

# Map each lowercase word longer than 2 characters to key = word, value = (city, city-tweet-count).
# We keep the city-tweet-count in order to sort the results later on.
words = top_tweets.flatMap(lambda row: (
    (w.lower(), (row[0], row[1][0])) for w in row[1][1].split(' ') if len(w) > 2))
# ('#repost', ('Houston, TX', 21499))

# Load stopwords
stopwords = load_stopwords()
# ('a', None)

# Subtract the sets
words_filtered = words.subtractByKey(stopwords)
# ('clerk', ('Houston, TX', 213))

# Map to (city, city-count, word) as key, 1 as value
words_intermediate = words_filtered.map(lambda row: ((row[1][0], row[1][1], row[0]), 1))
# (('Houston, TX', 21499, 'even'), 1)

# Aggregate over the (city, count, word) keys
words_counted = words_intermediate.reduceByKey(lambda a, b: a + b)
# (('Manhattan, NY', 495, 'new'), 158)

# Sort words by frequency
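# A runnable toy version of the pipeline above, showing subtractByKey-based stopword removal,
# reduceByKey counting, and one possible way to finish the "sort words by frequency" step.
# The sample records and the use of sortBy are assumptions for illustration only.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# (word, (city, city-tweet-count)) pairs, mirroring the shape of `words` above
toy_words = sc.parallelize([
    ('clerk', ('Houston, TX', 213)),
    ('the',   ('Houston, TX', 213)),
    ('new',   ('Manhattan, NY', 495)),
    ('new',   ('Manhattan, NY', 495)),
])
toy_stopwords = sc.parallelize([('the', None)])

toy_filtered = toy_words.subtractByKey(toy_stopwords)                  # drop stopwords by key
toy_counted = (toy_filtered
               .map(lambda row: ((row[1][0], row[1][1], row[0]), 1))   # ((city, count, word), 1)
               .reduceByKey(lambda a, b: a + b))                       # sum per (city, count, word)
toy_sorted = toy_counted.sortBy(lambda kv: kv[1], ascending=False)     # most frequent words first
print(toy_sorted.collect())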
parser.add_argument(
    '--right-pad-symbol',
    help='Right pad symbol at the end of sentences.',
    default='</s>')
parser.add_argument('--no-padding', dest='padding', action='store_false')

args = parser.parse_args()

if not args.padding:
    args.left_pad_symbol = None
    args.right_pad_symbol = None

if args.skipgram and args.n != 2:
    print('Skipgrams only allowed for n = 2.', file=sys.stderr)
    sys.exit(1)

print(args)

file_paths = scan_all_files(args.root_dir)
stopword_list = load_stopwords(args.stopwords)

iq, oq = Queue(), Queue()
processes = []
total_files_num = len(file_paths)
total_words_num = 0
total_counter = Counter()
num_processes = cpu_count()

def finish():
    for p in processes:
        p.terminate()
    iq.close()
    oq.close()
    end = timer()
    print(f'Computation took {end - start} seconds in total.')
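# One plausible way the iq/oq queues, worker processes, and total_counter set up above could be
# used: workers pull work items from the input queue, count tokens, and push partial Counters to
# the output queue, which the parent merges. Toy text chunks stand in for the real files/n-grams;
# the function and variable names here are illustrative, not the project's.
from collections import Counter
from multiprocessing import Process, Queue, cpu_count

def count_worker(in_queue, out_queue):
    # Consume chunks until a None sentinel arrives, emitting token counts per chunk.
    while True:
        chunk = in_queue.get()
        if chunk is None:
            break
        out_queue.put(Counter(chunk.lower().split()))

if __name__ == '__main__':
    chunks = ['the quick brown fox', 'the lazy dog', 'quick quick fox']  # stand-ins for file contents
    in_q, out_q = Queue(), Queue()
    workers = [Process(target=count_worker, args=(in_q, out_q)) for _ in range(cpu_count())]
    for p in workers:
        p.start()
    for chunk in chunks:
        in_q.put(chunk)
    for _ in workers:
        in_q.put(None)               # one sentinel per worker
    merged = Counter()
    for _ in chunks:
        merged.update(out_q.get())
    for p in workers:
        p.join()
    print(merged.most_common(5))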
import re
import string
from typing import List, Set

import jieba
import joblib
from zhon import hanzi

# Own customized modules
from global_variables import *
from util import load_stopwords

jieba.load_userdict(USERDICT_FILEPATH)

# regular expressions
PUNC_REGEX = r"^[{} \s]+$".format(string.punctuation + hanzi.punctuation)
NUM_REGEX = r"^[0-9]*\.?[0-9]+$"
UNIT_REGEX = r"^([0-9]*)(mm|cm|m)?$"
ALPHANUM_REGEX = r"^[a-zA-Z0-9]+$"

stopwords = load_stopwords(STOPWORDS_FILEPATH)
vectorizer = joblib.load(VECTORIZER_FILEPATH)
model = joblib.load(MODEL_FILEPATH)


def segment(text: str, stopwords: Set, lowercase: bool = True) -> str:
    if lowercase:
        text = text.lower()
    words = []
    jieba_res = jieba.cut(text)
    for w in jieba_res:
        if len(w) <= 1:
            continue
        if w in stopwords:
            continue
        if re.match(PUNC_REGEX, w):
            continue
        # assumed completion of the truncated original: also drop numeric, unit-like, and
        # ASCII-alphanumeric tokens per the regexes defined above, then join the remainder
        if re.match(NUM_REGEX, w) or re.match(UNIT_REGEX, w) or re.match(ALPHANUM_REGEX, w):
            continue
        words.append(w)
    return ' '.join(words)
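# A small usage sketch for segment(); the sample sentence and the tiny inline stopword set are
# made up for illustration (the module above loads its real stopwords from STOPWORDS_FILEPATH).
if __name__ == '__main__':
    demo_stopwords = {'的', '了'}
    print(segment('这款手机的屏幕尺寸真的非常合适', demo_stopwords))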