Example 1
def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount,100])

    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    bagOfWords = []
    for art in articleList.items():        
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if w not in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")  # boundary token so bigrams do not span comments
            
    tempVector = dict()
        
    # Build bigrams over the whole corpus and keep the 100 most frequent ones
    bgs = nltk.bigrams(bagOfWords)
    fdist = nltk.FreqDist(bgs)

    # NLTK 2.x sorts FreqDist.keys() by frequency; under NLTK 3 use fdist.most_common(100)
    for k in fdist.keys()[:100]:
        tempVector[k] = 0
    
    
    theKeys = tempVector.keys()
    
    for art in articleList.items():        
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if w not in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if w in tempVector):
                keyInd = theKeys.index(word)      
                featureMatrix[index][keyInd] += 1
                           
            index += 1
            if index % 100 == 0:
                print "extracted", index, "features"
        
            if index >= commentCount:
                break
        if index >= commentCount:
            break
            
            
    
    
    print "non-zero",np.count_nonzero(featureMatrix)
    print "Percentage filled:%.2f" %(float(np.count_nonzero(featureMatrix))/(featureMatrix.shape[0]*featureMatrix.shape[1]))
    return featureMatrix
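The function above relies on module-level imports (numpy as np, nltk, nltk.corpus.stopwords, SnowballStemmer) and on the project helpers words() and known_words(). A minimal, self-contained sketch of the same bigram-counting step with the NLTK 3 API, using an illustrative token list:

import nltk

tokens = ["the", "quick", "fox", "jumps", "over", "the", "lazy", "dog", "the", "quick"]
fdist = nltk.FreqDist(nltk.bigrams(tokens))   # count adjacent word pairs
print(fdist.most_common(3))                   # e.g. [(('the', 'quick'), 2), ...]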
Example 2
def read_news24_comments(filename, skip=True, skip_mtn=False, limit = -1):
    # Short list of comments
    values = defaultdict(list)
    headers_news24 = ['article_id', 'comment_id','thread_root_id', 'user_id', 'likes','dislikes','reported','status','rating','date','author','article_title','article_body','comment_content','lemma_body','pos_body']
    f1 = open(filename, 'r')
    
    commentCount = 0
    totalCount = 0
    lessThanCount = 0
    mtnCount = 0
    
    for line in f1:
        temp = line.split('&')
        totalCount += 1

        body = temp[13].lower()
        if skip:
            if len(words(body)) < WORD_MIN:
                lessThanCount += 1
                continue
            
        if skip_mtn:
            # Skip comments that mention any of these terms
            if any(w in body for w in ("mtn", "honda", "toyota", "form", "camry", "service")):
                mtnCount += 1
                continue
            
        commentCount += 1
        if commentCount % 10000 == 0:
            print "Read", commentCount, "comments"
            
        for i,v in enumerate(temp):
            values[headers_news24[i]].append(v)
    
   
    f1.close()
    df_news24_large = pd.DataFrame(values)
    
    
    # Parse the date strings into time structs
    def map_date(date):
        return strptime(date, "%Y-%m-%d %H:%M:%S.%f")
         
                                
    df_news24_large.date = df_news24_large.date.map(map_date) 
    
    
    # Top-level comments have a 'null' thread_root_id; make each one its own thread root
    cond = df_news24_large.thread_root_id == 'null'    
    df_news24_large.loc[cond, 'thread_root_id'] = df_news24_large['comment_id']
    print df_news24_large[cond].shape
    
   
    print "Saved",commentCount,"comments out of", totalCount
    print lessThanCount, "comments less than", WORD_MIN
    print mtnCount, "mtn comments"
    
    return df_news24_large
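read_news24_comments assumes pandas as pd, collections.defaultdict, time.strptime and the module-level words() helper and WORD_MIN constant; the limit parameter is currently unused. A minimal, self-contained sketch of the defaultdict(list) to DataFrame pattern it uses (field names and rows here are purely illustrative):

from collections import defaultdict
import pandas as pd

values = defaultdict(list)
for line in ["a1&c1&5", "a2&c2&0"]:
    for name, field in zip(['article_id', 'comment_id', 'likes'], line.split('&')):
        values[name].append(field)
print(pd.DataFrame(values))   # one column per header, one row per input line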
Example 3
def extract_sentence_values(articleList, commentList, parentList, commentCount):
    valueVector = np.empty([commentCount,4])
    index = 0

    for commList in commentList.values():
        # Total likes + dislikes in the thread (computed but not used below)
        sumVotes = 0
        for comm in commList:
            sumVotes += comm.likeCount + comm.dislikeCount
            
        for comm in commList:
            sentences = nltk.sent_tokenize(comm.lemma_body)
            for sent in sentences:        
            
                tokens = nltk.regexp_tokenize(sent, pattern)
                theWords = words(comm.body)
                uniqueWords = set(theWords)
                
                if len(tokens) == 0 or len(uniqueWords) == 0:
                    continue
                
                
                # Fraction of votes that are likes (guard against division by zero)
                ratio = comm.likeCount / float(max(1, comm.likeCount + comm.dislikeCount))

                totalVotes = comm.likeCount + comm.dislikeCount
                    
                                           
                # Columns: total votes, like ratio, reported flag, inverted status flag
                valueVector[index,0] = totalVotes
                valueVector[index,1] = ratio
                if comm.reported > 0:
                    valueVector[index,2] = 1
                else:
                    valueVector[index,2] = 0
                
                if comm.status == 1:
                    valueVector[index,3] = 0
                else:
                    valueVector[index,3] = 1                
                
                index += 1
                if index % 1000 == 0:
                    print "extracted", index, "values"
            
                if index >= commentCount:
                    break
            if index >= commentCount:
                break
                
    return valueVector
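Besides numpy and nltk, this function assumes a module-level regexp pattern and comment objects exposing likeCount, dislikeCount, reported, status, body and lemma_body. A minimal sketch of the sentence and token split it performs per comment (assumes NLTK's 'punkt' tokenizer data is installed; the pattern below is illustrative):

import nltk

pattern = r"\w+"
for sent in nltk.sent_tokenize("The first sentence. And a second one!"):
    print(nltk.regexp_tokenize(sent, pattern))   # word tokens per sentence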
Example 4
def read_slashdot_comments(filename, skip=True):
    values = defaultdict(list)
    headers = ['article_id', 'comment_id', 'thread_root_id', 'parent_id', 'author', 'score', 'flag', 'date', 'wtf',
               'article_title', 'article_body', 'comment_title', 'has_link', 'comment_content', 'quoted_text']

    skippedCount = 0
    commentCount = 0
    f1 = open(filename, 'r')

    for line in f1:
        temp = line.split('\t')
        if len(temp) < 14:
            continue
        if len(words(temp[13])) == 0:
            continue

        for i, v in enumerate(temp):
            values[headers[i]].append(v)

    f1.close()

    # Create Dataframe
    df_slashdot = pd.DataFrame(values)
    df_slashdot.drop('wtf', axis=1, inplace=True)
    # Decode Strings
    for col in df_slashdot.columns:
        df_slashdot[col] = df_slashdot[col].str.decode('iso-8859-1').str.encode('utf-8')

    # Add root for null roots
    cond = df_slashdot.thread_root_id == 'NULL'
    df_slashdot.loc[cond, 'thread_root_id'] = df_slashdot['comment_id']
    print df_slashdot[cond].shape


    # Parse Slashdot timestamps; some carry a leading "<>" placeholder
    def map_date(date):
        try:
            return strptime(date, "<> on %A %B %d, %Y @%H:%M%p ()")
        except ValueError:
            return strptime(date, "on %A %B %d, %Y @%H:%M%p ()")

    df_slashdot.date = df_slashdot.date.map(map_date)

    if skip:
        # Optionally drop posts by 'anonymous coward' and comments with a score of 2
        df_slashdot = df_slashdot[df_slashdot['author'].str.lower() != 'anonymous coward']
        df_slashdot = df_slashdot[df_slashdot['score'] != '2']

    print "Done with comments"

    return df_slashdot
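The two timestamp formats differ only in a leading "<>" placeholder, so the parser tries the longer format first and falls back on ValueError. A self-contained sketch of that fallback, assuming strptime is time.strptime (the sample timestamp is illustrative):

from time import strptime

def parse_slashdot_date(date):
    try:
        return strptime(date, "<> on %A %B %d, %Y @%H:%M%p ()")
    except ValueError:
        return strptime(date, "on %A %B %d, %Y @%H:%M%p ()")

print(parse_slashdot_date("on Monday July 01, 2013 @02:30PM ()"))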
Example 5
def extract_feature_matrix(df_comments, df_thread_groupby):
    print "START"
    # Sentence Tokenizer
    sentencer = SentenceTokenizer()
    
    clf = load_classifier(sentiment_path + 'sentiment_classifier.pickle')
        
    # 25 features per comment: columns 3-16, 19 and 22-24 are filled in the per-comment
    # pass below; columns 0-2, 17-18 and 20-21 in the thread-level pass that follows
    featureMatrix = np.empty([df_comments.shape[0],25])
    
    feature_dict = dict()
    for ix, row in df_comments.iterrows():
        feature_dict[row['comment_id']] = ix
    
    feature_count = 0
    
    for _,row in df_comments.iterrows():
        index = feature_dict[row['comment_id']]
        
        comm = row['comment_content'].decode('ASCII', 'ignore')
        tokens = words(comm)
        unique_tokens = set(tokens)
        sentences = sentencer.tokenize(comm)
        
        featureMatrix[index][3] =  len(comm)
        
        verb_fr, noun_fr, pronoun_fr = pos_freq(tokens)
        featureMatrix[index][4] = verb_fr
        featureMatrix[index][5] = noun_fr
        featureMatrix[index][6] = pronoun_fr
        
        featureMatrix[index][7] = capital_frequency(tokens)
        featureMatrix[index][8] = sent_frequency(sentences, '?')
        featureMatrix[index][9] = sent_frequency(sentences, '!')
        featureMatrix[index][10] = sentence_capital_frequency(sentences)
        
        featureMatrix[index][11] = entropy(comm)
        featureMatrix[index][12] = lexical_diversity(tokens)
        
        
        if len(tokens) == 0:
            featureMatrix[index][13] =  0
            featureMatrix[index][14] =  0
            featureMatrix[index][15] =  0
            featureMatrix[index][16] =  0
        else:
            spelt_wrong = missing_words(unique_tokens)
            bad_words_list = swears(unique_tokens)
            
            featureMatrix[index][13] =  len(spelt_wrong)
            featureMatrix[index][14] =  len(spelt_wrong)/float(len(unique_tokens))
            featureMatrix[index][15] =  len(bad_words_list)
            featureMatrix[index][16] =  len(bad_words_list)/float(len(unique_tokens))
            
            
        featureMatrix[index][19] =  F_K_score(sentences, tokens)
        
        testSet = dict()
        refWords = make_full_dict(tokens)
        testSet.update(refWords)
    
        probDist = clf.prob_classify(testSet)                
        sentiment = probDist.prob('pos')            
        subj_obj = get_subjectivity(probDist)
    
        polarity_overlap = get_polarity_overlap(words(row['article_body']), tokens, clf)
        featureMatrix[index][22] =  sentiment
        featureMatrix[index][23] =  subj_obj
        featureMatrix[index][24] =  polarity_overlap
        
        feature_count += 1
        if feature_count % 1000 == 0:
            print feature_count
    
    print "DONE"
    
    feature_count = 0
    # Grouped
    for _,group in df_thread_groupby:
        thread_comments = [row['comment_content'] for _,row in group.iterrows()]
        
        # Get average time
        sumTime = 0 
        count = 0                
        previous = mktime(group.iloc[0]['date'])
        first = mktime(group.iloc[0]['date'])
        
        # Average length
        sumLen = 0 
        
        
        thread_tokens = []    
        
        # Within Thread
        for _, row in group.iterrows():
            index = feature_dict[row['comment_id']]
            comm = row['comment_content'].decode('ascii','ignore')
            tokens = words(comm)
            sentences = sentencer.tokenize(comm)
            
            # Ongoing average time
            sumTime += mktime(row['date']) - previous
            count += 1            
            avgTime = sumTime/float(count)
            
            # Ongoing average length
            sumLen += len(words(row['comment_content']))
            avgLen = sumLen/float(count)
            
            ######################################################################
            # Get chunked sentences
            for sent in sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = [] 
                for chunk in chunks:
                    if isinstance(chunk, nltk.Tree):
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                doc = [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1]
                
                # The cumulative tokens up to this point
                thread_tokens += doc
            
            ######################################################################
            # Named-entity tokens from the article body
            article_tokens = []
            article_sentences = sentencer.tokenize(row['article_body'])
            for sent in article_sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = []
                for chunk in chunks:
                    if isinstance(chunk, nltk.Tree):
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                article_tokens += [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1]
            
            ######################################################################
            
            
            featureMatrix[index][0] = timeliness(mktime(row['date']), previous, max(avgTime, 1))
            previous = mktime(row['date'])        
            
            featureMatrix[index][1] =  mktime(row['date']) - first  
            
            featureMatrix[index][2] = lengthiness(words(row['comment_content']), max(avgLen, 1))  
            
            featureMatrix[index][17] =  np.mean([termf(comm.count(w), tokens) for w in set(tokens)])  
            featureMatrix[index][18] =  tf_idf(comm, thread_comments)     
            
            featureMatrix[index][20] =  onSubForumTopic(tokens, thread_tokens)
            featureMatrix[index][21] =  onSubForumTopic(tokens, article_tokens)
    
    
            feature_count += 1
            if feature_count % 1000 == 0:
                print feature_count
    
    return featureMatrix
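extract_feature_matrix depends on numpy, pandas, nltk, string and time.mktime, plus module-level helpers (words, pos_freq, capital_frequency, entropy, lexical_diversity, missing_words, swears, F_K_score, termf, tf_idf, timeliness, lengthiness, onSubForumTopic, make_full_dict, get_subjectivity, get_polarity_overlap), a SentenceTokenizer class, and a sentiment classifier loaded from sentiment_path. A minimal, self-contained sketch of the named-entity flattening used in both chunking loops (assumes NLTK's tokenizer, POS-tagger and named-entity chunker models are installed):

import nltk

tokens = nltk.word_tokenize("Barack Obama visited South Africa last year.")
chunks = nltk.ne_chunk(nltk.pos_tag(tokens), binary=True)
doc = []
for chunk in chunks:
    if isinstance(chunk, nltk.Tree):
        doc.append(' '.join(leaf[0] for leaf in chunk.leaves()))   # merge a multi-word entity
    else:
        doc.append(chunk[0])                                       # plain word from its (word, tag) pair
print(doc)   # named entities appear as single joined strings, e.g. 'Barack Obama'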