from sklearn.feature_extraction.text import CountVectorizer


def getBagOfWords(documents, stopWords, minThreshold, maxThreshold):
    # Configure the vectorizer before fitting: a custom stop-word list plus
    # document-frequency thresholds for pruning rare and overly common terms.
    vectorizer = CountVectorizer()
    vectorizer.stop_words = stopWords
    vectorizer.min_df = minThreshold
    vectorizer.max_df = maxThreshold
    # Learn the vocabulary and return the fitted vectorizer together with a
    # dense document-term matrix.
    X = vectorizer.fit_transform(documents)
    return vectorizer, X.toarray()
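A minimal usage sketch for the helper above; the sample documents, stop words, and thresholds are illustrative assumptions, not part of the original example.

# Hypothetical usage of getBagOfWords (illustrative values only).
docs = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs make good pets",
]
vec, counts = getBagOfWords(docs, stopWords=["the", "on"],
                            minThreshold=1, maxThreshold=1.0)
print(counts.shape)  # (n_documents, n_features)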
Example #2
def pipeline_process_data(paths, haveTarget = False, dataNum = 500):
    '''
    Build a TF-IDF-weighted bag-of-words feature matrix from the given sources.

    input:
        paths      -- data file paths (haveTarget=True) or news-site URLs to
                      crawl with newspaper (haveTarget=False)
        haveTarget -- whether the sources come with target labels
        dataNum    -- number of samples to load
    output:
        (feature_matrix, target) if haveTarget is True, otherwise
        (feature_matrix, data). Relies on module-level helpers not shown here
        (encode_url, _parse, _getData_pos_neg, bigrams_Tokenizer).
    '''
    memoize=True

    if not haveTarget:
        for path in paths:
            if not os.path.exists(encode_url(path)):
                memoize = False
            else:
                memoize = True
            print("\nloading news from ", path)
            paper = newspaper.build(path, memoize_articles = memoize)
            urls = []
            f = open(encode_url(path, "URLData/"), 'w')
            for article in paper.articles:
                urls.append(article.url)
                f.write(article.url + "\n")
            f.close()


    if haveTarget:
        feature, target = _getData_pos_neg(_parse(paths), dataNum, haveTarget = True)
    else:
        urls = []
        for path in paths:
            f = open(encode_url(path, "URLData/"), 'r')
            urls.extend(f.readlines())
        feature, data = _getData_pos_neg(urls, dataNum, haveTarget = False)

    vectorizer = CountVectorizer(min_df=1)

    vectorizer.stop_words = stopwords.words('english')

    vectorizer.tokenizer = bigrams_Tokenizer()
    # vectorizer.tokenizer = RegexpTokenizer(r'\w+')
    vectorizer.max_features = 2000
    feature_matrix = vectorizer.fit_transform(feature)

    transformer = TfidfTransformer().fit(feature_matrix)
    feature_matrix = transformer.transform(feature_matrix)

    if haveTarget:
        return feature_matrix, target   
    else:
        return feature_matrix, data
    

# ## test code
# if __name__ == "__main__":
#     X, Y = pipeline_process_data('/Users/gaoqin/Downloads/reviews_Video_Games.json.gz')
#     X = pipeline_process_data('https://www.huffingtonpost.com/')
#     count = [0, 0, 0, 0, 0]
#     for i in Y[400:]:
#         count[int(i - 1)] += 1
#     print(count)
Example #3
def estimate_stop_words(stop_words, classifier, X, y):
    # Compare cross-validated scores for several candidate stop-word lists.
    # `validate` and `n_cv` are assumed to be defined elsewhere in the module
    # (e.g. a cross_val_score-style helper and a fold count).
    scores = {}
    vectorizer = CountVectorizer()
    for lbl, stop in stop_words.items():
        vectorizer.stop_words = stop
        pipeline = Pipeline([('countvectorizer', vectorizer),
                             (type(classifier).__name__, classifier)])
        scores[lbl] = validate(pipeline, X, y, cv=n_cv).mean()
        print('Score for %s: %f' % (lbl, scores[lbl]))
    return scores
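Since the validate helper and n_cv above are not shown, here is a self-contained sketch of the same comparison using scikit-learn's cross_val_score; the toy documents, labels, and stop-word sets are assumptions for illustration.

from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

docs = ["buy cheap pills now", "see you at lunch", "cheap spam offer now",
        "are you coming today", "spam spam buy now", "hello how are you"]
labels = [1, 0, 1, 0, 1, 0]

# Candidate stop-word lists to compare (toy data, illustrative only).
stop_word_sets = {"none": None, "english": list(ENGLISH_STOP_WORDS)}
for lbl, stop in stop_word_sets.items():
    pipe = Pipeline([("countvectorizer", CountVectorizer(stop_words=stop)),
                     ("multinomialnb", MultinomialNB())])
    score = cross_val_score(pipe, docs, labels, cv=3).mean()
    print('Score for %s: %f' % (lbl, score))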
Example #4
def vectorize_columnTfIdf(dataframe,
                          column_name,
                          vectorizer=None,
                          n_samples=None,
                          tf_idf=False):

    # Extra "stop words": numeric junk tokens plus several NLTK stop-word lists.
    # Note: not every language listed here ships with NLTK's stopwords corpus
    # (e.g. japanese and korean are typically unavailable and will raise a
    # lookup error).
    more_stopwords = ['00', '000', '0000', '0003', '0004', '0004', '0005']
    more_stopwords += stopwords.words('english')
    more_stopwords += stopwords.words('japanese')
    more_stopwords += stopwords.words('chinese')
    more_stopwords += stopwords.words('arabic')
    more_stopwords += stopwords.words('korean')
    more_stopwords += stopwords.words('russian')

    if vectorizer is None:
        print("No vectorizer explicitly specified; using CountVectorizer as the default.")
        column_vectorizer = CountVectorizer(
            min_df=1, max_df=0.99,
            stop_words=more_stopwords)  # default vectorizer
    else:
        column_vectorizer = vectorizer
        column_vectorizer.stop_words = more_stopwords

    if column_name in dataframe.columns.values.tolist():

        if n_samples is None:
            # select all the samples from the column passed as param.
            column_df = dataframe[column_name]
            print(len(column_df))
        else:
            # select only the first n_samples rows of the column.
            column_df = dataframe[column_name].iloc[:n_samples]
        fmatrix = column_vectorizer.fit_transform(column_df)

        if tf_idf:
            # Re-weight the raw counts with L2-normalized TF-IDF.
            tfidf_transformer = TfidfTransformer(norm='l2').fit(fmatrix)
            fmatrix = tfidf_transformer.transform(fmatrix)

        dataframe_f = pd.DataFrame(
            fmatrix.todense(), columns=column_vectorizer.get_feature_names())
        print("formed dataframe of size: (%d, %d)" %
              (dataframe_f.index.max() + 1, dataframe_f.head(1).shape[1]))

        return dataframe_f, fmatrix, column_vectorizer
    else:
        print("No column found")
Example #6
def main():
    print("\n-----LDA CONCEPT DETECTION-----")
    corpus = load_from_csv(
        "../../data/lemmatized_segments/soi-meme-full-lemma.csv")

    # Create CountVectorizer to get Document-Term matrix
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")

    vectorizer.stop_words = load_stop_words("../../data/stopwords-fr.txt")

    proc_corpus, proc_corpus_text_only = remove_short_segs(corpus, vectorizer)

    # train vectorizer on corpus
    dt_matrix = vectorizer.fit_transform(proc_corpus_text_only)

    feature_names = vectorizer.get_feature_names()
    # print("Number of Features: " + str(len(feature_names)))

    # initialize model
    print("initialize model")
    scores = []
    for i in range(1, 10):
        print("____Running_" + str(i) + "_Topics___")
        lda = LatentDirichletAllocation(n_components=i,
                                        max_iter=400,
                                        learning_method='batch',
                                        random_state=55,
                                        evaluate_every=5)

        info = cross_validate(lda, dt_matrix, scoring=perplexity_score, cv=10)
        scores.append(info["test_score"])
    for score in scores:
        print(list(score))
    return 0
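The perplexity_score passed to cross_validate above is not included in this excerpt; a hypothetical stand-in with the callable-scorer signature scikit-learn expects, (estimator, X, y), could look like this (lower perplexity is better, so it is negated).

# Hypothetical stand-in for the perplexity_score scorer used above; the real
# helper is not shown in this snippet.
def perplexity_score(estimator, X, y=None):
    # cross_validate maximizes scores, so return negative perplexity.
    return -estimator.perplexity(X)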
Example #7
def getKeywords(model, data):
    print("generating keywords")

    vectorizer = CountVectorizer(min_df=1)

    vectorizer.stop_words = stopwords.words('english')

    vectorizer.tokenizer = keyword_generator()
    # vectorizer.tokenizer = RegexpTokenizer(r'\w+')
    feature_matrix = vectorizer.fit_transform(data)

    # transformer = TfidfTransformer().fit(feature_matrix)
    # feature_matrix = transformer.transform(feature_matrix)

    vocab = list(vectorizer.get_feature_names())

    # `normalize` here is assumed to be a helper defined elsewhere in the
    # module; .A1 flattens the per-term count sums into a 1-D array.
    counts = normalize(feature_matrix.sum(axis=0).A1)
    from collections import Counter
    freq_distribution = Counter(dict(zip(vocab, counts)))
    res = dict(freq_distribution.most_common(500))
    badwords = open("bad-words.txt", "r").readlines()
    badwords = set(word.strip() for word in badwords)
    nres = dict()
    for words in res:
        for word in words.split():
            if word in badwords:
                nres[words] = res[words]
                print(words, nres[words])
                break  # one flagged word is enough to keep this phrase
    import json
    json_str = json.dumps(nres)
    # print(json_str)
    f = open(model + "_keywords.json", "w")
    f.write(json_str)
    f.close()
    return freq_distribution
Example #8
# metrics
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

spam = pd.read_csv("./spam.csv", encoding="ISO-8859-1")

X = spam.v2
y = spam.v1

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=1)

vectorizer = CountVectorizer()
vectorizer.stop_words = text.ENGLISH_STOP_WORDS
X_train = vectorizer.fit_transform(X_train)

mnb = MultinomialNB()
mnb.fit(X_train, y_train)


def test():
    y_pred = mnb.predict(vectorizer.transform(X_test))
    print("Multinomial Naive Bayes on Spam SMS Dataset Accuracy (in %):",
          metrics.accuracy_score(y_test, y_pred) * 100)


def predict(text):
    if not isinstance(text, list):
        text = [text]
    # The original excerpt ends here; completing it with the natural next step
    # (an assumption): vectorize the input like the training data and classify.
    return mnb.predict(vectorizer.transform(text))
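A short usage sketch for the script above; the message text is made up for illustration.

if __name__ == "__main__":
    test()
    # Classify a single made-up message with the completed predict() above.
    print(predict("Free entry! Reply WIN to claim your prize"))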
Example #9
def hashtagTracker(request):

    if request.GET.get('num1'):
        hashtag = request.GET['num1']
        # print("\033[1m" + "Scraping/analyzing posts for " + hashtag + "..." + "\033[0m")
        page = requests.get("https://www.instagram.com/explore/tags/" +
                            hashtag[1:])
        posts = json.loads(page.text[page.text.find("window._sharedData") +
                                     21:page.text.find("};</script>") + 1])
        postCount = posts["entry_data"]["TagPage"][0]["graphql"]["hashtag"][
            "edge_hashtag_to_media"]["count"]
        minTopLikes = 0
        meanTopLikes = 0
        medianRecentTime = 0
        percentRecentVids = 0

        if postCount != 0:
            i = 0
            totalTop = 0
            for post in posts["entry_data"]["TagPage"][0]["graphql"][
                    "hashtag"]["edge_hashtag_to_top_posts"]["edges"]:
                if post["node"]["edge_liked_by"][
                        "count"] < minTopLikes or i == 0:
                    minTopLikes = post["node"]["edge_liked_by"]["count"]
                totalTop += post["node"]["edge_liked_by"]["count"]
                i += 1
            meanTopLikes = totalTop / i
            print("Looked at " + str(i) + " top posts...")

            j = 0
            totalTimeList = []
            totalRecentVids = 0
            for post in posts["entry_data"]["TagPage"][0]["graphql"][
                    "hashtag"]["edge_hashtag_to_media"]["edges"]:
                totalTimeList.append(post["node"]["taken_at_timestamp"])
                if post["node"]["is_video"]:
                    totalRecentVids += 1

                j += 1
                if j == 100:
                    break
            if j != 1:
                medianRecentTime = np.median(np.diff(sorted(totalTimeList)))
            percentRecentVids = (totalRecentVids / j) * 100
            print("Looked at " + str(j) + " recent posts...")
            tags = []

            for post in posts["entry_data"]["TagPage"][0]["graphql"][
                    "hashtag"]["edge_hashtag_to_related_tags"]["edges"]:
                tags.append(post["node"]["name"])

            ExtraTags = []
            for post in posts["entry_data"]["TagPage"][0]["graphql"][
                    "hashtag"]["edge_hashtag_to_top_posts"]["edges"]:
                ExtraTags.append((
                    post["node"]["edge_media_to_caption"]["edges"][0]["node"]))
            print((ExtraTags))

            caption = []
            for sentance in ExtraTags:
                caption.append(sentance['text'])
            # print(caption)

            arrayOfHashs = []
            for hash in caption:
                hash1 = ''.join(hash)
                arrayOfHashs.append(extract_hash_tags(hash1))

            print(arrayOfHashs)

            ListOfHashs = [y for x in arrayOfHashs for y in x]
            print(ListOfHashs)

            count_model = CountVectorizer(
                ngram_range=(1, 1))  # default unigram model
            # count_model.min_df=2
            count_model.max_features = 15
            count_model.stop_words = ['01', '05', '2layersprotection']
            X = count_model.fit_transform(ListOfHashs)
            AIRec = count_model.get_feature_names()

            ListOfHashsText = ' '.join(ListOfHashs)
            print(AIRec)

            count_list = X.toarray().sum(axis=0)
            count_list = dict(zip(count_model.get_feature_names(), count_list))
            hashRankValues = list(count_list.values())
            hashRankWords = list(count_list.keys())

            videoCount, ImgCount = video_count(
                posts["entry_data"]["TagPage"][0]["graphql"]["hashtag"]
                ["edge_hashtag_to_media"]["edges"])
            LikesCount = likes_count(
                posts["entry_data"]["TagPage"][0]["graphql"]["hashtag"]
                ["edge_hashtag_to_media"]["edges"])

            hashRankWords = str(hashRankWords).replace("\'", "\"")

        else:
            # Note: when no posts exist, the variables used by the render call
            # below (tags, AIRec, videoCount, ...) are never assigned, so that
            # call will raise a NameError.
            print("\033[93m" + "No posts exist for this hashtag" + "\033[0m")

        # outputFile.write(hashtag + "," + str(postCount) + "," + str(minTopLikes) + "," + str(meanTopLikes) + "," + str(
        # medianRecentTime) + "," + str(percentRecentVids) + "\n")

        return render(
            request, "dashboard.html", {
                'hashtag': hashtag,
                'postCount': postCount,
                'minTopLikes': minTopLikes,
                'meanTopLikes': meanTopLikes,
                'medianRecentTime': medianRecentTime,
                'percentRecentVids': percentRecentVids,
                'tags': tags,
                'AIRec': AIRec,
                'videoCount': videoCount,
                'LikesCount': LikesCount,
                'totalTop': totalTop,
                'ImgCount': ImgCount,
                'ListOfHashsText': ListOfHashsText,
                'hashRankWords': hashRankWords,
                'hashRankValues': hashRankValues
            })
    else:

        return render(
            request, "dashboard.html", {
                'hashtag': 0,
                'postCount': 0,
                'minTopLikes': 0,
                'meanTopLikes': 0,
                'medianRecentTime': 0,
                'percentRecentVids': 0,
                'tags': 0,
                'AIRec': 0,
                'videoCount': 0,
                'LikesCount': 0,
                'totalTop': 0,
                'ImgCount': 0,
                'ListOfHashsText': 0,
                'hashRankWords': 0,
                'hashRankValues': 0
            })
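The extract_hash_tags helper used in this view is not shown; a minimal stand-in (an assumption, not the original implementation) could be:

import re

# Hypothetical stand-in for the extract_hash_tags helper referenced above:
# returns the set of hashtag words found in a caption string.
def extract_hash_tags(text):
    return set(re.findall(r"#(\w+)", text))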
Example #10
def bow_vectorizer(data):
    print("Bag Of Words:")
    bow_vect = CountVectorizer(encoding='utf-8')
    # read_stop_words() is a helper defined elsewhere in the module.
    bow_vect.stop_words = read_stop_words()
    # Fit the vocabulary; the document-term matrix returned by fit_transform
    # is discarded and only the fitted vectorizer is returned.
    bow_vect.fit_transform(data)
    return bow_vect
Example #11
def main():
    print("\n-----LDA CONCEPT DETECITON-----")
    corpus = load_from_csv(
        "../../data/lemmatized_segments/soi-meme-full-lemma.csv")

    # Create CountVectorizer to get Document-Term matrix
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")

    vectorizer.stop_words = load_stop_words("../../data/stopwords-fr.txt")

    proc_corpus, proc_corpus_text_only = remove_short_segs(corpus, vectorizer)

    # train vectorizer on corpus
    dt_matrix = vectorizer.fit_transform(proc_corpus_text_only)

    feature_names = vectorizer.get_feature_names()
    # print("Number of Features: " + str(len(feature_names)))

    # initialize model
    # print("initialize model")
    for i in range(1, 5):
        print("____Running_" + str(i) + "_Topics___")
        for j in range(0, 3):
            lda = LatentDirichletAllocation(n_components=i,
                                            max_iter=400,
                                            learning_method='batch',
                                            random_state=random.randint(
                                                0, 100),
                                            evaluate_every=5)

            # train the model on the corpus and get a document topic matrix for the corpus
            print('fit model to corpus')
            doc_topic_matrix = lda.fit_transform(dt_matrix)
            topic_term_matrix = lda.components_

            # print("visualizing")
            # # # visualize(doc_topic_matrix)
            # #
            # print("Score: " + str(lda.score(dt_matrix) / get_num_tokens(dt_matrix)))
            #
            # print("running elbow")
            # # print topics, 10 is the number of words in the topic dist to display (e.g. top 10)
            print("Perplexity:", lda.perplexity(dt_matrix))
            print("Log likelihood: ",
                  lda.score(dt_matrix) / get_num_tokens(dt_matrix))
            print("N_ITER: ", lda.n_iter_)
            print("Random State: ", lda.random_state)
            topic_str_list = print_topics(lda, feature_names, 10)
    # run_elbow(lda, feature_names)
    # #
    # # for i in range(0, len(concepts)):
    # #     query_list = concepts[i]
    # #     topicid_list = get_topics_w_query(topic_term_matrix, TOP_N_WORDS, feature_names, query_list)
    # #     seg_list, num_rel_segs = get_segs_w_query(doc_topic_matrix, topicid_list, 10, query_list)
    # #
    # #     if len(seg_list) > 0:
    # #         write_output_file_xlsx(query_list, topic_str_list, topicid_list, filepath, num_segs, seg_list, num_rel_segs, text_corpus)
    # #
    # #

    return 0
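The print_topics helper used above (like get_num_tokens) is not included in this excerpt; a hypothetical sketch that prints and returns the top-n terms of each LDA topic could look like this.

# Hypothetical stand-in for the print_topics helper used above.
def print_topics(lda_model, feature_names, n_top_words):
    topic_strings = []
    for topic_idx, topic in enumerate(lda_model.components_):
        # Indices of the n_top_words highest-weighted terms, descending.
        top_terms = [feature_names[i]
                     for i in topic.argsort()[:-n_top_words - 1:-1]]
        topic_strings.append("Topic %d: %s" % (topic_idx, " ".join(top_terms)))
        print(topic_strings[-1])
    return topic_strings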