def get_top_sectors():
    # Load the normalization resources used to process route names.
    synonyms = load_synonyms('./datasets/sinonimos.csv')
    synonyms1 = load_synonyms('./datasets/sinonimos2.csv')
    dictionary = load_words()
    stop_words = load_stop_words('./datasets/stop-words.txt')
    routes = load_routes('./datasets/routes.txt')

    # Restore the route counter persisted by count_routes().
    counter = Counter()
    with open('counter.txt', 'rb') as fp:
        counter = pickle.load(fp)

    topRoutes = set(counter.elements())
    sectorGraph = get_graph()
    listRoutes = list(topRoutes)

    # Collect every graph edge whose routes mention one of the counted avenues.
    topSectors = []
    for avenue in listRoutes:
        for (x, y) in sectorGraph.edges():
            routesEdge = sectorGraph.edge[x][y]['routes']
            for route in routesEdge:
                processedRoute = process_tweet(route, synonyms, synonyms1,
                                               dictionary, stop_words)
                if processedRoute.find(avenue) > -1:
                    topSectors.append({'from': x, 'to': y})
    return json.dumps(topSectors)
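# Note: load_stop_words(), load_synonyms(), load_words(), load_routes(), get_graph()
# and process_tweet() come from this project's own helper modules and are not shown
# here. As an illustration only, a minimal load_stop_words() consistent with the call
# above (one stop word per line in a plain-text file) might look like this sketch:
def load_stop_words(path):
    with open(path) as fp:
        return set(line.strip() for line in fp if line.strip())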
def count_routes():
    # Load the normalization resources used to process tweets.
    synonyms = load_synonyms('./datasets/sinonimos.csv')
    synonyms1 = load_synonyms('./datasets/sinonimos2.csv')
    dictionary = load_words()
    stop_words = load_stop_words('./datasets/stop-words.txt')
    routes = load_routes('./datasets/routes.txt')
    tweets = retrieve_tweets()

    # Resume from a previously persisted counter, if one exists.
    counter = Counter()
    if not file_is_empty('./datasets/counter.txt'):
        with open('./datasets/counter.txt', 'rb') as fp:
            counter = pickle.load(fp)

    for tweet in tweets:
        tweet_words = process_tweet(tweet.text, synonyms, synonyms1,
                                    dictionary, stop_words)
        # Search for each route name inside the processed tweet and count it.
        for route in routes:
            if re.search(route, tweet_words):
                counter[route] += 1

    print counter
    with open('counter.txt', 'wb') as fp:
        pickle.dump(counter, fp)
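# count_routes() also relies on a file_is_empty() helper that is not defined in this
# snippet; a minimal sketch, under the assumption that it only checks whether the
# counter file exists and has content, could be:
import os

def file_is_empty(path):
    # A missing file is treated the same as an empty one.
    return (not os.path.isfile(path)) or os.path.getsize(path) == 0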
def init():
    print 'Loading training samples..'
    training_samples = utils.load_samples('../data/askubuntu/train_random.txt')
    print len(training_samples)

    print 'Loading dev samples..'
    dev_samples = utils.load_samples('../data/askubuntu/dev.txt')
    print len(dev_samples)

    print 'Loading test samples..'
    test_samples = utils.load_samples('../data/askubuntu/test.txt')
    print len(test_samples)

    print 'Loading corpus..'
    question_map = utils.load_corpus('../data/askubuntu/text_tokenized.txt')
    print len(question_map)

    print 'Loading stop words..'
    stop_words = utils.load_stop_words('../data/english_stop_words.txt')
    print len(stop_words)

    # Each corpus entry is a (title, body) pair; join them into a single text.
    corpus_texts = map(lambda (t, b): t + ' ' + b, question_map.values())

    print 'Loading embeddings..'
    embedding_map = utils.load_embeddings(
        '../data/pruned_askubuntu_android_vector.txt', corpus_texts, stop_words)
    print len(embedding_map)
    print utils.store_embedding_map(embedding_map)

    return (training_samples, dev_samples, test_samples, question_map,
            embedding_map)
def init():
    print 'Loading askubuntu training samples..'
    askubuntu_training_samples = utils.load_samples(
        '../data/askubuntu/train_random.txt')
    print len(askubuntu_training_samples)

    print 'Loading askubuntu dev samples..'
    askubuntu_dev_samples = utils.load_samples('../data/askubuntu/dev.txt')
    print len(askubuntu_dev_samples)

    print 'Loading askubuntu test samples..'
    askubuntu_test_samples = utils.load_samples('../data/askubuntu/test.txt')
    print len(askubuntu_test_samples)

    print 'Loading askubuntu corpus..'
    askubuntu_question_map = utils.load_corpus(
        '../data/askubuntu/text_tokenized.txt')
    print len(askubuntu_question_map)

    print 'Loading android dev samples..'
    android_dev_samples = utils.load_samples_stupid_format(
        '../data/android/dev.pos.txt', '../data/android/dev.neg.txt')
    print len(android_dev_samples)

    print 'Loading android test samples..'
    android_test_samples = utils.load_samples_stupid_format(
        '../data/android/test.pos.txt', '../data/android/test.neg.txt')
    print len(android_test_samples)

    print 'Loading android corpus..'
    android_question_map = utils.load_corpus('../data/android/corpus.tsv')
    print len(android_question_map)

    print 'Loading stop words..'
    stop_words = utils.load_stop_words('../data/english_stop_words.txt')
    print len(stop_words)

    corpus_texts = map(lambda (t, b): t + ' ' + b,
                       askubuntu_question_map.values() +
                       android_question_map.values())

    print 'Loading embeddings..'
    embedding_map = utils.load_embeddings(
        '../data/pruned_android_vector.txt',  # pruned_askubuntu_android_vector.txt
        corpus_texts, stop_words)
    print len(embedding_map)
    print utils.store_embedding_map(embedding_map)

    return (askubuntu_training_samples, askubuntu_dev_samples,
            askubuntu_test_samples, askubuntu_question_map,
            android_dev_samples, android_test_samples,
            android_question_map, embedding_map)
def main():
    args = cmdparser()
    config = get_config(args.config)

    # Optionally rebuild the train/dev splits, label set and vocabulary from the raw data.
    if args.preprocess:
        utils.preprocess(config['raw_path'], config['train_path'],
                         config['dev_path'], config['label_path'],
                         config['stop_word_path'], config['vocabulary_path'])

    labels = utils.load_labels(config['label_path'])
    vocabulary = utils.load_vocabulary(config['vocabulary_path'])
    stop_words = utils.load_stop_words(config['stop_word_path'])

    if args.dev:
        train(config, vocabulary, labels, stop_words, save_path='', mode='dev')
    elif args.train:
        if int(config['ensemble_size']) == 1:
            train(config, vocabulary, labels, stop_words,
                  save_path=config['model_path'], mode='train')
        else:
            # Train one model per ensemble member, each saved to its own path.
            for i in range(int(config['ensemble_size'])):
                train(config, vocabulary, labels, stop_words,
                      save_path=config[f'model_path_{i+1}'], mode='train')
    elif args.test:
        if int(config['ensemble_size']) == 1:
            test(config, vocabulary, labels, stop_words,
                 save_path=[config['model_path']])
        else:
            test_paths = [config[f'model_path_{i+1}']
                          for i in range(int(config['ensemble_size']))]
            test(config, vocabulary, labels, stop_words, save_path=test_paths)
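# main() assumes get_config() returns a dict-like mapping; its real format is not
# shown here. A purely hypothetical layout, consistent with the keys read above
# (all paths invented for illustration), might be:
example_config = {
    'raw_path': './data/raw.txt',
    'train_path': './data/train.txt',
    'dev_path': './data/dev.txt',
    'label_path': './data/labels.txt',
    'stop_word_path': './data/stop_words.txt',
    'vocabulary_path': './data/vocab.txt',
    'ensemble_size': '3',  # read with int(...) above, so a string value also works
    'model_path': './models/model',
    'model_path_1': './models/model_1',
    'model_path_2': './models/model_2',
    'model_path_3': './models/model_3',
}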
def main():
    # Force UTF-8 output on Python 2 stdout/stderr.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    sys.stderr = codecs.getwriter('utf8')(sys.stderr)

    spark = SparkSession.builder.appName("LDA Batch Model").getOrCreate()
    sc = spark.sparkContext
    print AWS_ACCESS_KEY_ID   # debug output; avoid printing credentials in production
    print AWS_SECRET_ACCESS_KEY

    # Configure Hadoop to read from / write to S3 through the s3a filesystem.
    sc._jsc.hadoopConfiguration().set(
        "fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key', AWS_SECRET_ACCESS_KEY)

    custom_stop_words = utils.load_stop_words(sc)
    texts_df = utils.load_texts(spark)

    # Run the preprocessing pipeline (tokenization, stop-word removal, vectorization).
    pipeline = ml_utils.set_pipeline(custom_stop_words)
    model = pipeline.fit(texts_df)
    result = model.transform(texts_df)

    # Cluster the documents into NUMBER_OF_TOPICS topics using LDA.
    lda = LDA(k=NUMBER_OF_TOPICS, maxIter=5, featuresCol="vectors")
    lda_model = lda.fit(result)

    # Describe each topic by its top three weighted terms.
    topics = lda_model.describeTopics(3)
    print("The topics described by their top-weighted terms:")
    topics.show(truncate=False)

    # Show the per-document topic distributions.
    transformed = lda_model.transform(result)
    transformed.show(truncate=False)

    # Persist the model to S3.
    lda_model.save("s3a://current-models/LDAModel")
    sc.stop()
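# The snippet above persists the model but does not show how it is loaded back; a
# minimal sketch of restoring it elsewhere (the default online optimizer produces a
# LocalLDAModel) might be:
from pyspark.ml.clustering import LocalLDAModel

restored_lda_model = LocalLDAModel.load("s3a://current-models/LDAModel")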
import argparse

from utils import (load_stop_words, load_book, plot_freq, cut_word,
                   clean_zh_corpus, count_freq)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--zh_corpus', default="诛仙.txt", type=str)
    parser.add_argument('--stop_words', default='停用词表.txt', type=str)
    args = parser.parse_known_args()[0]

    zh_corpus = args.zh_corpus
    stop_words_file = args.stop_words

    # Load and clean the Chinese corpus, then segment it into words.
    book = load_book(zh_corpus)
    book = clean_zh_corpus(corpus=book)
    text, words = cut_word(text=book)

    # Count the 50 most frequent words, excluding stop words.
    stop_words = load_stop_words(stop_words_file=stop_words_file)
    words, freq = count_freq(words=words, n=50, stop_words=stop_words)

    # Plot the word-frequency chart.
    plot_freq(words=words, freq=freq, label_fs=14, ticks_fs=13)
# gensim expects the corpus as a list of tokenized sentences, e.g. [[w, w, w], [w, w, w, w]].
def train_word2vector(x):
    # Pre-4.0 gensim API: size/iter were renamed to vector_size/epochs in gensim 4.x.
    model = word2vec.Word2Vec(x, size=250, window=5, min_count=5,
                              workers=12, iter=5, sg=1)
    return model


if __name__ == '__main__':
    stop_words = load_stop_words()
    raw_train_x, raw_train_y = load_training_data()
    raw_test_x = load_testing_data()

    # Segment the raw texts and filter out stop words.
    seg_train_x = preprocess_x(raw_train_x, stop_words)
    seg_train_y = preprocess_y(raw_train_y)
    seg_test_x = preprocess_x(raw_test_x, stop_words)

    # Train skip-gram word2vec on all segmented texts and save the model.
    w2v_model = train_word2vector(seg_train_x + seg_train_y + seg_test_x)
    w2v_model.save('./w2v.model')

    # Build the word <-> index mappings and convert sentences to id sequences.
    index2word = w2v_model.wv.index2word
    word2index = {word: index for index, word in enumerate(index2word)}
    train_X = sens_to_ids(seg_train_x, word2index)
    train_Y = sens_to_ids(seg_train_y, word2index)
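# sens_to_ids() is called above but not defined in this snippet; a minimal sketch,
# assuming it maps each token to its vocabulary index and simply drops
# out-of-vocabulary words:
def sens_to_ids(sentences, word2index):
    return [[word2index[word] for word in sentence if word in word2index]
            for sentence in sentences]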