import logging

from gensim import corpora
from gensim.models import TfidfModel

# perform_lda and enpickle are project-local helpers not shown in this excerpt


def do_lda(num_topics, passes, iterations, chunksize, tfidf, wiki_path=None):
    model_name = 'tweets_'

    logging.info('Loading user dictionary...')
    dictionary = corpora.Dictionary.load('data/dictionary/tweets.dict')
    corpus = dictionary.corpus  # bag-of-words corpus stored on the dictionary object

    if tfidf is True:
        logging.info('Computing TF-IDF...')
        tfidf_model = TfidfModel(corpus, normalize=False)

        logging.info('Transforming the corpus...')
        corpus = [tfidf_model[doc] for doc in corpus]  # apply TF-IDF once per document
        model_name += 'tfidf_'

    if wiki_path is not None:
        model_name += 'wiki_'

    logging.info('Performing LDA on user corpus...')
    model, vectors, ids = perform_lda(dictionary=dictionary, corpus=corpus, num_topics=num_topics,
                                      passes=passes, iterations=iterations, chunksize=chunksize,
                                      wiki_path=wiki_path)
    model.print_topics(topics=num_topics, topn=10)

    model.save('data/model/' + model_name + str(num_topics) + '.lda')
    enpickle(vectors, 'data/vector/' + model_name + str(num_topics) + '.pkl')
    enpickle(ids, 'data/vector/ids.pkl')
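# A minimal usage sketch (not from the original code): the hyperparameter values and the
# wiki_path below are hypothetical; only the call signature follows do_lda() as defined above.
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    do_lda(num_topics=30, passes=10, iterations=100, chunksize=2000, tfidf=True,
           wiki_path='data/processed/wikis.pkl')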
import re

import pattern.web
from pattern.web import Wikipedia


def crawl_wiki(model_path):
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords(model_path=model_path, threshold=0.001)

    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True
                    # retry with a reformulated query
                    if '-' in keyword:
                        keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()  # short lowercase keywords may be acronyms
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on Wikipedia!',
                stop = True
                # retry with a reformulated query
                if '-' in keyword:
                    keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()  # short lowercase keywords may be acronyms
                    stop = False

    enpickle(wikis, 'data/others/wikis.pkl')
    print '\n'
    return wikis
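# get_keywords() is used by crawl_wiki() but not defined in this excerpt. A minimal sketch of
# what it might do, assuming "keywords" are topic words whose within-topic probability exceeds
# the threshold. The show_topics() call below follows the current gensim signature, while the
# original code appears to target an older gensim (print_topics(topics=..., topn=...)), so
# treat this as an illustrative stand-in rather than the original implementation.
from gensim.models import LdaModel

def get_keywords(model_path, threshold=0.001):
    model = LdaModel.load(model_path)
    keywords = set()
    for topic_id, word_probs in model.show_topics(num_topics=-1, num_words=50, formatted=False):
        for word, prob in word_probs:
            if prob >= threshold:
                keywords.add(word)
    return sorted(keywords)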
count = 0
doc_num = len(wikis)
new_wikis = []
keywords = []

for keyword, wiki in wikis.items():
    count += 1
    print '\r', count, '/', doc_num,

    text = wiki['text']
    cleaned = clean_text(text)  # delete irrelevant characters

    wiki = []
    tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
    for token in tokens:
        word, pos = token.split('/')
        wiki.append(word)

    # convert compound word into one token
    wiki = convert_compound(wiki)

    # filter stop words, long words, and non-english words
    wiki = [w for w in wiki
            if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]  # FIXME: this allows non-english characters to be stored

    new_wikis.append(wiki)
    keywords.append(keyword)

print '\n'

logging.info('Saving wiki corpus...')
enpickle(new_wikis, 'data/processed/wikis.pkl')
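# A sketch of how per-document token lists like new_wikis (or the analogous tweet token lists)
# could be turned into the gensim dictionary and bag-of-words corpus that do_lda() loads.
# Attaching the corpus to the Dictionary object before saving mirrors the dictionary.corpus
# access in do_lda(), but the pruning thresholds and exact storage layout are assumptions,
# not the original pipeline.
from gensim import corpora

def build_dictionary(docs, dict_path):
    dictionary = corpora.Dictionary(docs)  # map each token to an integer id
    dictionary.filter_extremes(no_below=5, no_above=0.5)  # hypothetical pruning thresholds
    dictionary.corpus = [dictionary.doc2bow(doc) for doc in docs]  # bag-of-words vectors
    dictionary.save(dict_path)
    return dictionary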
        continue
    if text not in text_index:  # skip duplicate tweet texts
        text_index[text] = ''
        tweets[id] = text

# other english tweets
other_file = open('data/original/agreegatedEnglishTweets.csv', 'rb')
other_csv = csv.reader(other_file)
for row in other_csv:
    id, text = row
    id = 'other_' + id
    if len(text) < 10:  # skip very short tweets
        continue
    if text not in text_index:  # skip duplicate tweet texts
        text_index[text] = ''
        tweets[id] = text

# additional tweets
other_file = open('data/original/additional.csv', 'rb')
other_csv = csv.reader(other_file)
for row in other_csv:
    id, text = row
    id = 'additional_' + id
    if len(text) < 10:  # skip very short tweets
        continue
    if text not in text_index:  # skip duplicate tweet texts
        text_index[text] = ''
        tweets[id] = text

enpickle(tweets, 'data/processed/tweets.pkl')
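# enpickle() is a project helper used throughout this code but not defined in the excerpt.
# A minimal sketch, assuming it simply writes the object to disk with pickle:
import pickle

def enpickle(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)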