Example No. 1
def get_top_sectors():
    
    synonyms = load_synonyms('./datasets/sinonimos.csv')
    synonyms1 = load_synonyms('./datasets/sinonimos2.csv')
    dictionary = load_words()
    stop_words = load_stop_words('./datasets/stop-words.txt')
    routes = load_routes('./datasets/routes.txt')

    counter = Counter()

    # load the route counter persisted by count_routes (see Example No. 3)
    with open('counter.txt', 'rb') as fp:
        counter = pickle.load(fp)

    topRoutes = set(counter.elements())

    sectorGraph = get_graph()

    listRoutes = list(topRoutes)

    topSectors = []

    for avenue in listRoutes:
        for (x, y) in sectorGraph.edges():
            routesEdge = sectorGraph.edge[x][y]['routes']
            for route in routesEdge:
                processedRoute = process_tweet(route, synonyms, synonyms1, dictionary, stop_words)

                if (processedRoute.find(avenue) > -1):
                    topSectors.append({'from': x, 'to': y})

    return json.dumps(topSectors)
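
The loader helpers used here (load_synonyms, load_words, load_stop_words, load_routes) are defined elsewhere in the project. A minimal sketch of what a file-based load_stop_words could look like, assuming stop-words.txt holds one stop word per line:

# Hypothetical sketch, not the project's actual helper: read one stop word
# per line and return them as a set for fast membership checks.
def load_stop_words(path):
    with open(path) as fp:
        return set(line.strip() for line in fp if line.strip())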
Example No. 2
def get_top_sectors():

    synonyms = load_synonyms('./datasets/sinonimos.csv')
    synonyms1 = load_synonyms('./datasets/sinonimos2.csv')
    dictionary = load_words()
    stop_words = load_stop_words('./datasets/stop-words.txt')
    routes = load_routes('./datasets/routes.txt')

    counter = Counter()

    # load the route counter persisted by count_routes (see Example No. 3)
    with open('counter.txt', 'rb') as fp:
        counter = pickle.load(fp)

    topRoutes = set(counter.elements())

    sectorGraph = get_graph()

    listRoutes = list(topRoutes)

    topSectors = []

    for avenue in listRoutes:
        for (x, y) in sectorGraph.edges():
            routesEdge = sectorGraph.edge[x][y]['routes']
            for route in routesEdge:
                processedRoute = process_tweet(route, synonyms, synonyms1,
                                               dictionary, stop_words)

                if (processedRoute.find(avenue) > -1):
                    topSectors.append({'from': x, 'to': y})

    return json.dumps(topSectors)
Example No. 3
def count_routes():
    synonyms = load_synonyms('./datasets/sinonimos.csv')
    synonyms1 = load_synonyms('./datasets/sinonimos2.csv')
    dictionary = load_words()
    stop_words = load_stop_words('./datasets/stop-words.txt')
    routes = load_routes('./datasets/routes.txt')
    Tweets = retrieve_tweets()
    counter = Counter()

    if not file_is_empty('./datasets/counter.txt'):
        with open('./datasets/counter.txt') as fp:
            counter = pickle.load(fp)

    for tweet in Tweets:
        Tweet_words = process_tweet(tweet.text, synonyms, synonyms1,
                                    dictionary, stop_words)
        for route in routes:
            # look for each route name in the tweet text and count the matches
            if re.search(route, Tweet_words):
                counter[route] += 1

    print counter
    with open('counter.txt', 'wb') as fp:
        pickle.dump(counter, fp)
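
file_is_empty is another project helper that is not shown. A minimal sketch, assuming it only needs to check the on-disk size and treat a missing file as empty:

import os

# Hypothetical sketch of the file_is_empty helper used above: a missing
# file or a zero-byte file counts as empty.
def file_is_empty(path):
    return not os.path.isfile(path) or os.path.getsize(path) == 0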
Example No. 4
def init():
    print 'Loading training samples..'
    training_samples = utils.load_samples('../data/askubuntu/train_random.txt')
    print len(training_samples)

    print 'Loading dev samples..'
    dev_samples = utils.load_samples('../data/askubuntu/dev.txt')
    print len(dev_samples)

    print 'Loading test samples..'
    test_samples = utils.load_samples('../data/askubuntu/test.txt')
    print len(test_samples)

    print 'Loading corpus..'
    question_map = utils.load_corpus('../data/askubuntu/text_tokenized.txt')
    print len(question_map)

    print 'Loading stop words..'
    stop_words = utils.load_stop_words('../data/english_stop_words.txt')
    print len(stop_words)

    corpus_texts = map(lambda (t, b): t + ' ' + b, question_map.values())

    print 'Loading embeddings..'
    embedding_map = utils.load_embeddings(
        '../data/pruned_askubuntu_android_vector.txt', corpus_texts,
        stop_words)
    print len(embedding_map)
    print

    utils.store_embedding_map(embedding_map)

    return (training_samples, dev_samples, test_samples, question_map,
            embedding_map)
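
Note that map(lambda (t, b): ...) relies on tuple-parameter unpacking, which only exists in Python 2 (it was removed by PEP 3113), and in Python 3 map returns a lazy iterator. An equivalent Python 3 construction of corpus_texts would be:

# Python 3 equivalent of the corpus_texts line above: unpack the
# (title, body) tuples explicitly and build a concrete list.
corpus_texts = [title + ' ' + body for title, body in question_map.values()]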
Example No. 5
def init():
    print 'Loading askubuntu training samples..'
    askubuntu_training_samples = utils.load_samples(
        '../data/askubuntu/train_random.txt')
    print len(askubuntu_training_samples)

    print 'Loading askubuntu dev samples..'
    askubuntu_dev_samples = utils.load_samples('../data/askubuntu/dev.txt')
    print len(askubuntu_dev_samples)

    print 'Loading askubuntu test samples..'
    askubuntu_test_samples = utils.load_samples('../data/askubuntu/test.txt')
    print len(askubuntu_test_samples)

    print 'Loading askubuntu corpus..'
    askubuntu_question_map = utils.load_corpus(
        '../data/askubuntu/text_tokenized.txt')
    print len(askubuntu_question_map)

    print 'Loading android dev samples..'
    android_dev_samples = utils.load_samples_stupid_format(
        '../data/android/dev.pos.txt', '../data/android/dev.neg.txt')
    print len(android_dev_samples)

    print 'Loading android test samples..'
    android_test_samples = utils.load_samples_stupid_format(
        '../data/android/test.pos.txt', '../data/android/test.neg.txt')
    print len(android_test_samples)

    print 'Loading android corpus..'
    android_question_map = utils.load_corpus('../data/android/corpus.tsv')
    print len(android_question_map)
    
    print 'Loading stop words..'
    stop_words = utils.load_stop_words('../data/english_stop_words.txt')
    print len(stop_words)

    corpus_texts = map(lambda (t, b): t + ' ' + b,
                       askubuntu_question_map.values() + android_question_map.values())
    
    print 'Loading embeddings..'
    embedding_map = utils.load_embeddings(
        '../data/pruned_android_vector.txt', corpus_texts, stop_words)  # pruned_askubuntu_android_vector.txt
    print len(embedding_map)
    print

    utils.store_embedding_map(embedding_map)

    return (
        askubuntu_training_samples,
        askubuntu_dev_samples,
        askubuntu_test_samples,
        askubuntu_question_map,
        android_dev_samples,
        android_test_samples,
        android_question_map,
        embedding_map)
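
Here the Python 2 dependency is even stronger: besides the unpacking lambda, askubuntu_question_map.values() + android_question_map.values() only works where values() returns lists. Under Python 3 the two view objects would need to be chained explicitly, for example:

from itertools import chain

# Python 3 equivalent of building corpus_texts from both corpora.
corpus_texts = [
    title + ' ' + body
    for title, body in chain(askubuntu_question_map.values(),
                             android_question_map.values())
]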
Example No. 6
def main():
    args = cmdparser()
    config = get_config(args.config)
    if args.preprocess:
        utils.preprocess(config['raw_path'], config['train_path'],
                         config['dev_path'], config['label_path'],
                         config['stop_word_path'], config['vocabulary_path'])
    labels = utils.load_labels(config['label_path'])
    vocabulary = utils.load_vocabulary(config['vocabulary_path'])
    stop_words = utils.load_stop_words(config['stop_word_path'])

    if args.dev:
        train(config, vocabulary, labels, stop_words, save_path='', mode='dev')
    elif args.train:
        if int(config['ensemble_size']) == 1:
            train(config,
                  vocabulary,
                  labels,
                  stop_words,
                  save_path=config['model_path'],
                  mode='train')
        else:
            for i in range(int(config['ensemble_size'])):
                train(config,
                      vocabulary,
                      labels,
                      stop_words,
                      save_path=config[f'model_path_{i+1}'],
                      mode='train')
    elif args.test:
        if int(config['ensemble_size']) == 1:
            test(config,
                 vocabulary,
                 labels,
                 stop_words,
                 save_path=[config['model_path']])
        else:
            test_paths = [
                config[f'model_path_{i+1}']
                for i in range(int(config['ensemble_size']))
            ]
            test(config, vocabulary, labels, stop_words, save_path=test_paths)
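
The shape of the configuration returned by get_config is not shown. Judging only from the keys accessed in main(), it behaves like a flat string-to-string mapping; a hypothetical sketch (every path and the ensemble size below are invented for illustration):

# Hypothetical sketch of the mapping get_config is expected to return,
# inferred from the keys accessed in main(); all values here are made up.
config = {
    'raw_path': 'data/raw.txt',
    'train_path': 'data/train.txt',
    'dev_path': 'data/dev.txt',
    'label_path': 'data/labels.txt',
    'stop_word_path': 'data/stop_words.txt',
    'vocabulary_path': 'data/vocabulary.txt',
    'ensemble_size': '3',          # kept as a string, hence the int(...) casts
    'model_path': 'models/model.ckpt',
    'model_path_1': 'models/model_1.ckpt',
    'model_path_2': 'models/model_2.ckpt',
    'model_path_3': 'models/model_3.ckpt',
}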
Example No. 7
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    sys.stderr = codecs.getwriter('utf8')(sys.stderr)

    spark = SparkSession.builder.appName("LDA Batch Model").getOrCreate()
    sc = spark.sparkContext

    # NOTE: do not print AWS credentials; they are only passed to the Hadoop
    # configuration below.
    sc._jsc.hadoopConfiguration().set(
        "fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key',
                                      AWS_SECRET_ACCESS_KEY)

    custom_stop_words = utils.load_stop_words(sc)
    texts_df = utils.load_texts(spark)

    pipeline = ml_utils.set_pipeline(custom_stop_words)
    model = pipeline.fit(texts_df)

    result = model.transform(texts_df)

    # Cluster the documents into NUMBER_OF_TOPICS topics using LDA
    lda = LDA(k=NUMBER_OF_TOPICS, maxIter=5, featuresCol="vectors")
    lda_model = lda.fit(result)

    # Describe topics
    topics = lda_model.describeTopics(3)
    print("The topics described by their top-weighted terms:")
    topics.show(truncate=False)

    # Shows the result
    transformed = lda_model.transform(result)
    transformed.show(truncate=False)

    # Save the model
    lda_model.save("s3a://current-models/LDAModel")

    sc.stop()
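
ml_utils.set_pipeline is not shown, but since the LDA stage reads featuresCol="vectors", the pipeline must end with a stage that writes a "vectors" column. A minimal sketch with standard Spark ML stages (the "text" input column and the intermediate column names are assumptions):

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer

# Hypothetical sketch of a set_pipeline that feeds the LDA stage above:
# tokenize, drop the custom stop words, then emit term-count "vectors".
def set_pipeline(custom_stop_words):
    tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W+")
    remover = StopWordsRemover(inputCol="tokens", outputCol="filtered",
                               stopWords=custom_stop_words)
    vectorizer = CountVectorizer(inputCol="filtered", outputCol="vectors")
    return Pipeline(stages=[tokenizer, remover, vectorizer])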
Example No. 8
def count_routes():
    synonyms = load_synonyms('./datasets/sinonimos.csv')
    synonyms1 = load_synonyms('./datasets/sinonimos2.csv')
    dictionary = load_words()
    stop_words = load_stop_words('./datasets/stop-words.txt')
    routes = load_routes('./datasets/routes.txt')
    Tweets = retrieve_tweets()
    counter = Counter()
    
    if not file_is_empty('./datasets/counter.txt'):
        with open('./datasets/counter.txt') as fp:
            counter = pickle.load(fp)
    
    for tweet in Tweets:
        Tweet_words = process_tweet(tweet.text, synonyms, synonyms1, dictionary, stop_words)
        for route in routes:
            # look for each route name in the tweet text and count the matches
            if re.search(route, Tweet_words):
                counter[route] += 1
                
    print counter
    with open('counter.txt', 'wb') as fp:
        pickle.dump(counter, fp)
Example No. 9
import argparse
from utils import load_stop_words, load_book, plot_freq, cut_word, clean_zh_corpus, count_freq

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--zh_corpus', default="诛仙.txt", type=str)
    parser.add_argument('--stop_words', default='停用词表.txt', type=str)
    args = parser.parse_known_args()[0]
    zh_corpus = args.zh_corpus
    stop_words_file = args.stop_words

    book = load_book(zh_corpus)
    book = clean_zh_corpus(corpus=book)
    text, words = cut_word(text=book)
    stop_words = load_stop_words(stop_words_file=stop_words_file)
    words, freq = count_freq(words=words, n=50, stop_words=stop_words)
    plot_freq(words=words, freq=freq, label_fs=14, ticks_fs=13)  # plot the word-frequency chart
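
count_freq comes from the same utils module and is not shown. A minimal sketch consistent with the call above (the top n tokens by frequency after stop-word filtering, returned as parallel lists):

from collections import Counter

# Hypothetical sketch of count_freq: drop stop words, count the remaining
# tokens, and return the n most frequent words and their counts.
def count_freq(words, n, stop_words):
    counts = Counter(w for w in words if w not in stop_words)
    top = counts.most_common(n)
    return [w for w, _ in top], [c for _, c in top]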
Example No. 10
# gensim expects the corpus as a list of token lists, e.g. [[w, w, w], [w, w, w, w]]


def train_word2vector(x):
    model = word2vec.Word2Vec(x,
                              size=250,
                              window=5,
                              min_count=5,
                              workers=12,
                              iter=5,
                              sg=1)
    return model


if __name__ == '__main__':
    stop_words = load_stop_words()
    raw_train_x, raw_train_y = load_training_data()
    raw_test_x = load_testing_data()

    seg_train_x = preprocess_x(raw_train_x, stop_words)
    seg_train_y = preprocess_y(raw_train_y)
    seg_test_x = preprocess_x(raw_test_x, stop_words)

    w2v_model = train_word2vector(seg_train_x + seg_train_y + seg_test_x)
    w2v_model.save('./w2v.model')

    index2word = w2v_model.wv.index2word
    word2index = {word: index for index, word in enumerate(index2word)}

    train_X = sens_to_ids(seg_train_x, word2index)
    train_Y = sens_to_ids(seg_train_y, word2index)
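
The Word2Vec call above targets the pre-4.0 gensim API. Under gensim >= 4.0 several of these arguments were renamed (size -> vector_size, iter -> epochs) and wv.index2word became wv.index_to_key; a sketch of the equivalent call with the same hyperparameters:

from gensim.models import Word2Vec

# Equivalent training call for gensim >= 4.0 with the same hyperparameters.
def train_word2vector_gensim4(x):
    return Word2Vec(x,
                    vector_size=250,
                    window=5,
                    min_count=5,
                    workers=12,
                    epochs=5,
                    sg=1)

# and the vocabulary lookup becomes:
# index2word = w2v_model.wv.index_to_key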