def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """Lemmatize, filter, and convert raw corpora into a gensim Dictionary.

    :rtype: gensim.corpora.dictionary.Dictionary
    :param corpora: dict mapping a document id to its raw text
    :param stopwords: collection of stop words to drop from every document
    :param allowed_pos: compiled regex of POS tags to keep (passed to lemmatize as allowed_tags)
    :param max_doc: maximum number of documents to process
    :param no_above: drop terms that appear in more than this fraction of documents
    :param no_below: drop terms that appear in fewer than this many documents
    :param keep_n: keep at most this many terms after filtering (None keeps all)
    :return: Dictionary with .corpus (bag-of-words), .id2token, and .corpus_id2orig_id attached
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound words into single tokens
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-English words
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
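A call site for this function might look like the sketch below. The raw texts, the stop word list, and the topic count are placeholders, and clean_text, convert_compound, and revdict are assumed to be defined elsewhere in the module; the returned Dictionary carries everything needed to train an LDA model.

import re
from gensim.models import LdaModel

# hypothetical inputs: document id -> raw text
corpora = {'doc-1': 'Topic models uncover latent themes in large text collections.',
           'doc-2': 'Latent Dirichlet allocation is a common topic model.'}
stopwords = set(['a', 'an', 'the', 'in', 'is'])  # placeholder stop word list

# keep nouns only; lemmatize() expects a compiled regex of POS tags
dictionary = preprocess_corpora(corpora, stopwords, allowed_pos=re.compile('(NN)'))

# the Dictionary returned above carries the bag-of-words corpus and the id -> token map
lda = LdaModel(corpus=dictionary.corpus, id2word=dictionary.id2token, num_topics=2)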
doc_num = len(documents)
new_documents = []
titles = []
froms = []
dates = []

for index, document in documents.items():
    count += 1
    if count > max_doc:
        break

    print '\r', count, '/', doc_num,
    text = document['text'] + (' ' + index) * title_weight  # incorporate title information
    from_name = document['from']
    date = document['date']

    cleaned = clean_text(text)  # delete irrelevant characters
    document = []
    tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
    for token in tokens:
        word, pos = token.split('/')
        document.append(word)

    # convert compound words into single tokens
    document = convert_compound(document)

    # filter stop words, long words, and non-English words
    document = [w for w in document if w not in stop_words and 2 <= len(w) <= 15 and w.islower()]
    new_documents.append(document)
    titles.append(index)
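This fragment continues a document-preprocessing routine and assumes `documents` maps a title to a record with 'text', 'from', and 'date' fields. A hypothetical input (values are illustrative only) would look like:

documents = {
    'Weekly meeting notes': {
        'text': 'We discussed the release schedule and open bugs.',
        'from': 'alice@example.com',
        'date': '2015-06-01',
    },
}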
                  access_token_secret='c0S9NKtzXU9dyRrsVqura0EIzXFBIWro3rvJBkrCdSyJe')
print api.VerifyCredentials()

# wikis = unpickle('data/others/wikis.pkl')
# keywords = wikis.keys()
# keywords = get_keywords(model_path='data/model/tweets_100.lda')
keywords = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] * 100

tweets = []
for i, keyword in enumerate(keywords):
    try:
        print "{}th done".format(i)
        res = api.GetSearch(term=keyword, lang='en', count=25)  # lang takes an ISO 639-1 code
        for tweet in res:
            tweets.append(unicode(tweet.AsDict()['text']).replace('\n', ''))
    except Exception as e:
        print e

    if i % 150 == 0 and i > 0:
        # flush collected tweets to CSV, then pause to stay under the API rate limit
        f = open(file_name, 'ab')
        csvWriter = csv.writer(f)
        rx = re.compile(r'\W+')
        for tweet in tweets:
            tweet = clean_text(tweet)
            tweet = rx.sub(' ', tweet).strip()
            csvWriter.writerow([tweet])
        tweets = []
        f.close()
        sleep(60 * 15)  # avoid exceeding the API limit
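Once enough tweets have been collected, the CSV can be read back and handed to preprocess_corpora above. A minimal sketch follows; the path 'data/tweets.csv' and the stop word list are assumptions, the only requirement being one cleaned tweet per row as written by the collector above.

import csv
import re

tweets = {}
with open('data/tweets.csv', 'rb') as f:   # assumed output path of the collector above
    for i, row in enumerate(csv.reader(f)):
        if row and row[0]:
            tweets[str(i)] = row[0]        # use the row number as the document id

stopwords = set(['rt', 'http', 'the', 'a'])  # placeholder stop word list
dictionary = preprocess_corpora(tweets, stopwords, allowed_pos=re.compile('(NN)'))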