Example #1
# clean_text and convert_compound are project-local helpers defined elsewhere in this repo;
# gensim's lemmatize additionally requires the `pattern` package (it was removed in gensim 4.x)
import logging

from gensim.corpora import Dictionary
from gensim.utils import lemmatize, revdict


def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """Lemmatize, clean, and filter the corpora, then build a gensim Dictionary from them.

    :rtype: gensim.corpora.dictionary.Dictionary
    :param corpora: dict mapping an original document id to its raw text
    :param stopwords: collection of stop words to drop
    :param allowed_pos: regular expression of the POS tags passed to lemmatize (e.g. NN, VB, JJ, RB)
    :param max_doc: maximum number of documents to process
    :param no_above: drop tokens that appear in more than this fraction of the documents
    :param no_below: drop tokens that appear in fewer than this number of documents
    :param keep_n: keep only the n most frequent tokens (None keeps all)
    :return: Dictionary with extra attributes `corpus`, `id2token`, and `corpus_id2orig_id`
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')  # lemmatize returns tokens of the form 'word/POS'
            corpus.append(word)

        # convert compound words into single tokens
        corpus = convert_compound(corpus)

        # filter stop words, over/under-length words, and tokens that are not all lowercase
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
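
A minimal usage sketch for the function above. It assumes the project-local helpers clean_text and convert_compound are importable alongside it; the toy documents and stop word set are made up for illustration, and the POS regex mirrors the NN/VB/JJ/RB tag set referenced in the docstring. The returned Dictionary already carries the BOW corpus, so it can be fed straight into gensim's LdaModel:

import re

from gensim.models import LdaModel

# toy inputs (assumptions for illustration only)
toy_corpora = {'doc1': 'Cats are chasing mice in the garden.',
               'doc2': 'The garden is full of mice and cats.'}
stop_words = {'the', 'a', 'is', 'are', 'of', 'and', 'in'}
allowed_pos = re.compile('(NN|VB|JJ|RB)')  # noun/verb/adjective/adverb

dictionary = preprocess_corpora(toy_corpora, stop_words, allowed_pos, no_above=1.0)
lda = LdaModel(corpus=dictionary.corpus, id2word=dictionary, num_topics=2)
print dictionary.corpus_id2orig_id  # positions in dictionary.corpus map back to 'doc1', 'doc2'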
Example #2
    doc_num = len(documents)
    new_documents = []
    titles = []
    froms = []
    dates = []
    for index, document in documents.items():
        count += 1
        if count > max_doc:
            break

        print '\r', count, '/', doc_num,
        text = document['text'] + (' ' + index) * title_weight  # repeat the title (the dict key) title_weight times so its words get extra weight
        from_name = document['from']
        date = document['date']

        cleaned = clean_text(text)  # delete irrelevant characters

        document = []
        tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
        for token in tokens:
            word, pos = token.split('/')
            document.append(word)

        # convert compound words into single tokens
        document = convert_compound(document)

        # filter stop words, over/under-length words, and tokens that are not all lowercase
        document = [w for w in document if w not in stop_words and 2 <= len(w) <= 15 and w.islower()]

        new_documents.append(document)
        titles.append(index)
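
Example #2 is cut off here (froms and dates are presumably filled in the elided part of the loop). If the remainder mirrors Example #1, it would build a gensim dictionary and bag-of-words corpus from new_documents once the loop finishes; a sketch of that continuation, under that assumption, with placeholder filtering thresholds:

    dictionary = Dictionary(new_documents)
    dictionary.filter_extremes(no_below=2, no_above=0.5)  # placeholder thresholds
    dictionary.compactify()
    corpus = [dictionary.doc2bow(document) for document in new_documents]
    # titles, froms, and dates remain aligned with corpus by position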
Example #3
        access_token_secret='c0S9NKtzXU9dyRrsVqura0EIzXFBIWro3rvJBkrCdSyJe')

    print api.VerifyCredentials()

    # wikis = unpickle('data/others/wikis.pkl')
    # keywords = wikis.keys()
    # keywords = get_keywords(model_path='data/model/tweets_100.lda')
    keywords = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
                'v', 'w', 'x', 'y', 'z'] * 100
    tweets = []
    for i, keyword in enumerate(keywords):
        try:
            print "{}th done".format(i)
            res = api.GetSearch(term=keyword, lang='en', count=25)  # Twitter expects an ISO 639-1 code, not 'english'
            for tweet in res:
                tweets.append(unicode(tweet.AsDict()['text']).replace('\n', ''))
        except Exception as e:
            print e

        if i % 150 == 0 and i > 0:
            f = open(file_name, 'ab')
            csvWriter = csv.writer(f)
            rx = re.compile(r'\W+')  # collapse runs of non-word characters
            for tweet in tweets:
                tweet = clean_text(tweet)
                tweet = rx.sub(' ', tweet).strip()
                csvWriter.writerow([tweet])
            tweets = []
            f.close()
            sleep(60 * 15)  # avoid exceeding the API limit
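
To feed the collected tweets back into a pipeline like Example #1's preprocess_corpora, the CSV written above can be read into an id-to-text dict. This read-back step is not part of the original snippet, and the file path is a placeholder for the file_name used above:

import csv

tweet_corpora = {}
with open('data/tweets.csv', 'rb') as f:  # placeholder path
    for i, row in enumerate(csv.reader(f)):
        if row:  # skip blank rows
            tweet_corpora[str(i)] = row[0]
# tweet_corpora can now be passed as `corpora` to preprocess_corpora from Example #1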