Example #1
# Assumed imports for this snippet; `flatten` and `extract_NP` are project helpers defined elsewhere.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import pos_tag
import numpy as np


def print_representative_np(df, product, n=50):
    def _identity_tokenizer(text):
        return text

    tfidf = TfidfVectorizer(tokenizer=_identity_tokenizer,
                            stop_words='english',
                            lowercase=False)
    # Fall back to extracting the noun phrases on the fly if the 'nounPhrases' column is missing.
    try:
        result = tfidf.fit_transform(df['nounPhrases'])
    except Exception:
        df['posTagged'] = df['tokenizedSentences'].apply(
            lambda tokenizedSentences:
            [pos_tag(sentence) for sentence in tokenizedSentences])
        # `np` below is the noun-phrase string, shadowing the numpy alias inside the comprehension.
        df['nounPhrases'] = df['posTagged'].apply(lambda posTagged: [
            np.lower()
            for np in flatten([extract_NP(tag) for tag in posTagged])
        ])
        result = tfidf.fit_transform(df['nounPhrases'])

    # Note: scikit-learn >= 1.2 removed get_feature_names(); use get_feature_names_out() there.
    scores = zip(tfidf.get_feature_names(),
                 np.asarray(result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    print('=' * 30 + product + '=' * 30)
    for item in sorted_scores[:n]:
        print("{0:50} Score: {1}".format(item[0], item[1]))
    print()
    print()
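The identity tokenizer above is what lets TfidfVectorizer score pre-extracted noun phrases instead of re-tokenizing raw text. A minimal standalone sketch of that pattern (the toy phrase lists below are made up for illustration):

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Each "document" is already a list of noun phrases, so the tokenizer is a no-op.
toy_noun_phrases = [
    ['battery life', 'screen', 'battery life'],
    ['screen', 'charging cable'],
]
vectorizer = TfidfVectorizer(tokenizer=lambda tokens: tokens, lowercase=False)
matrix = vectorizer.fit_transform(toy_noun_phrases)

# Sum the TF-IDF weights per phrase across all documents and print them highest first.
# (get_feature_names_out requires scikit-learn >= 1.0; older versions use get_feature_names.)
totals = np.asarray(matrix.sum(axis=0)).ravel()
for phrase, score in sorted(zip(vectorizer.get_feature_names_out(), totals),
                            key=lambda pair: pair[1], reverse=True):
    print('{0:20} {1:.3f}'.format(phrase, score))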
def getKeyPhrases(student_summaryList, sennafile, method=None, MalformedFlilter=False, save2file=None):
    # Read the SENNA parse file and collect lowercased noun phrases.
    sentences = SennaParser.SennaParse(sennafile)

    phrases = []
    for s in sentences:
        if method=='syntax':
            NPs = s.getSyntaxNP()
        else:
            NPs = []
        
        for np in NPs:
            if MalformedFlilter:
                if isMalformed(np): 
                    continue
        
            phrases.append(np.lower())
            
    return phrases
Example #3
def get_position_score(keyphrase_candidate_list, position_bias):
    length = len(keyphrase_candidate_list)
    position_score = {}
    for i, kc in enumerate(keyphrase_candidate_list):
        np = kc[0]      # the noun-phrase string (shadows any numpy alias locally)
        p = kc[1][0]    # first token position of the candidate (unused below)
        np = np.lower()
        if np in position_score:
            # Only the first occurrence of a phrase contributes to its position score.
            position_score[np] += 0.0
        else:
            position_score[np] = 1 / (float(i) + 1 + position_bias)
    # Normalise the per-phrase scores with a softmax; relies on dicts preserving insertion order.
    score_list = softmax(list(position_score.values()))

    for i, np in enumerate(position_score):
        position_score[np] = score_list[i]
    return position_score
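For reference, get_position_score expects candidates as (phrase, positions) pairs and gives higher weight to phrases that first appear earlier in the list. A toy invocation, assuming the function above is in scope and using a minimal softmax stand-in (the real helper may come from elsewhere, e.g. scipy):

import numpy as np

def softmax(scores):
    # Minimal stand-in for the softmax helper used above.
    exps = np.exp(np.asarray(scores, dtype=float))
    return exps / exps.sum()

# Each candidate is (noun phrase, tuple of token positions); repeated phrases only count once.
candidates = [('battery life', (0,)), ('screen', (4,)), ('Battery Life', (9,))]
print(get_position_score(candidates, position_bias=3.4))
# -> {'battery life': ..., 'screen': ...} with the earlier phrase weighted higher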
Example #5
def sampleComments(data):
    data['NounChunks'] = data['Comment'].apply(lambda x: nounPhraseChunking(x))
    data['NounPhrase'] = data['NounChunks'].apply(
        lambda x: extractNPPhrases(x))
    # Create a dictionary of noun phrases
    npDict = {}
    nounPhrasePos = {}
    nounPhraseNeg = {}
    nounPhraseUnc = {}

    for idx, nounPhrase in enumerate(data['NounPhrase']):
        if len(nounPhrase) == 0:
            continue

        for np in nounPhrase:
            np = np.lower()

            if np not in npDict:
                npDict[np] = 1
            else:
                npDict[np] += 1

            if data.iloc[idx].Sentiment == 'POS':
                if np in nounPhrasePos:
                    nounPhrasePos[np] += 1
                else:
                    nounPhrasePos[np] = 1

            if data.iloc[idx].Sentiment == 'NEG':
                if np in nounPhraseNeg:
                    nounPhraseNeg[np] += 1
                else:
                    nounPhraseNeg[np] = 1

            if data.iloc[idx].Sentiment == 'UNC':
                if np in nounPhraseUnc:
                    nounPhraseUnc[np] += 1
                else:
                    nounPhraseUnc[np] = 1
    df = pd.DataFrame(
        data=[npDict, nounPhraseNeg, nounPhrasePos, nounPhraseUnc
              ]).transpose()
    df.rename(columns={0: 'All', 1: 'Neg', 2: 'Pos', 3: 'Unc'}, inplace=True)
    df.fillna(0, inplace=True)

    df['PosPercentage'] = df.eval('Pos/All')
    df['NegPercentage'] = df.eval('Neg/All')
    df['Diff'] = df.eval('PosPercentage-NegPercentage')

    topPosNP = df[df['Diff'] > 0]
    topPosNP = topPosNP.sort_values(by=['Diff', 'All'], ascending=False)

    topNegNP = df[df['Diff'] < 0]
    topNegNP = topNegNP.sort_values(by=['Diff', 'All'],
                                    ascending=[True, False])

    if topNegNP.shape[0] > 10:
        topNegNPList = topNegNP.index[:10].tolist()
    else:
        topNegNPList = topNegNP.index.tolist()

    if topPosNP.shape[0] > 10:
        topPosNPList = topPosNP.index[:10].tolist()
    else:
        topPosNPList = topPosNP.index.tolist()

    indexListPos = []
    indexListNeg = []
    for idx, nounPhrase in enumerate(data['NounPhrase']):
        if len(nounPhrase) == 0:
            continue

        for np in nounPhrase:
            np = np.lower()

            if np in topPosNPList and data.iloc[idx].Sentiment == 'POS':
                indexListPos.append(idx)
            if np in topNegNPList and data.iloc[idx].Sentiment == 'NEG':
                indexListNeg.append(idx)
    samplePosComments = data.iloc[indexListPos].sort_values(by=['Polarity'],
                                                            ascending=True)
    samplePosComments.reset_index(drop=True, inplace=True)

    sampleNegComments = data.iloc[indexListNeg].sort_values(by=['Polarity'],
                                                            ascending=True)
    sampleNegComments.reset_index(drop=True, inplace=True)
    return samplePosComments, sampleNegComments, topPosNPList, topNegNPList
data = []
with open(data_path) as f:
    for line in f:
        data.append(json.loads(line))
df = pd.DataFrame.from_dict(data)
df = df.drop(columns = ['overall', 'reviewTime', 'summary', 'unixReviewTime'])

df = df[:10000]

df['sentences'] = df['reviewText'].apply(segment_sent)
df['tokenizedSentences'] = df['sentences'].apply(lambda sentences: [tokenize(sentence) for sentence in sentences])
df['cleanedTokenizedSentences'] = df['sentences'].apply(lambda sentences: [preprocessSentence(sentence) for sentence in sentences])
cleanedTokenizedSentences = flatten(df['cleanedTokenizedSentences'])

df['posTagged'] = df['tokenizedSentences'].apply(lambda tokenizedSentences: [pos_tag(sentence) for sentence in tokenizedSentences])
df['nounPhrases'] = df['posTagged'].apply(lambda posTagged: [np.lower() for np in flatten([extract_NP(tag) for tag in posTagged])])
df['uniqueNounPhrases'] = df['nounPhrases'].apply(set).apply(list)

import gensim

# Note: gensim >= 4.0 renames the `size` argument to `vector_size`.
word2vec_model = gensim.models.Word2Vec(
    cleanedTokenizedSentences,
    seed=42,
    workers=10,
    size=150,
    min_count=2,
    window=10)

# NP Summary Per Review
# word2vec_model.train(sentences=cleanedTokenizedSentences, total_examples=len(cleanedTokenizedSentences), epochs=10)
# word2vec_model.save("word2vec_model1.w2v")
# print("Model saved")
Example #7
        w = ''
        for word in term:
            w += word + ' '

        noun_phrases.append(w.strip())

    for term in terms2:
        w = ''
        for word in term:
            w += word + ' '

        noun_phrases.append(w.strip())

    a = []
    for np in noun_phrases:
        a.extend(refine_noun_phrase(np.lower()))

    noun_phrases = list(a)

    left = len(set(phrase_list) - set(noun_phrases))
    overall_recall += L - left

    print(L, '\t', left / L)

    #     if left/L > 0.3 and L>10:
    #         print(left)
    #         print(cp_file)
    #         print(phrase_list)
    #         print()
    #         print(set(phrase_list) - set(noun_phrases))
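The fragment above measures extraction coverage by set difference: left counts gold phrases that were not recovered, so L - left is the hit count and left / L the miss rate. A self-contained toy version of that bookkeeping (the phrase lists are made up):

phrase_list = ['battery life', 'screen', 'charging cable']  # gold phrases
noun_phrases = ['battery life', 'screen', 'price']          # extracted phrases

L = len(phrase_list)
left = len(set(phrase_list) - set(noun_phrases))  # gold phrases that were missed
print('hits:', L - left, 'miss rate:', left / L)  # hits: 2, miss rate: 0.33...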
Example #8
# Assumed imports for this snippet; print_header, plot_bar, plot_bar_overlap, segment_sent,
# tokenize, is_word, flatten, extract_NP, print_representative_np, sentiment_score and the
# IMAGES_DIRECTORY / REP_DIRECTORY constants are project-specific and defined elsewhere.
import datetime
import json
import os
from collections import Counter

import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.stem import SnowballStemmer


def main(data_file, seed):

    # set seed
    np.random.seed(seed)

    # load in a pd.df
    data = [json.loads(line) for line in data_file]
    df = pd.DataFrame.from_dict(data)

    # make directory for images
    if not os.path.exists(IMAGES_DIRECTORY):
        os.mkdir(IMAGES_DIRECTORY)
    # make directory for representative words
    if not os.path.exists(REP_DIRECTORY):
        os.mkdir(REP_DIRECTORY)

    print_header('3.2.1 Popular Products and Frequent Reviewers', 50)

    ## 3.2.1 get top 10 products
    # (column naming assumes pandas < 2.0; newer pandas names the value_counts column 'count')
    top_10_products = df['asin'].value_counts().head(10).reset_index().rename(
        columns={
            'index': 'productID',
            'asin': 'reviewCount'
        })
    print_header('Top 10 products', char='-')
    print(top_10_products)

    #     productID  reviewCount
    # 0  B005SUHPO6          836
    # 1  B0042FV2SI          690
    # 2  B008OHNZI0          657
    # 3  B009RXU59C          634
    # 4  B000S5Q9CA          627
    # 5  B008DJIIG8          510
    # 6  B0090YGJ4I          448
    # 7  B009A5204K          434
    # 8  B00BT7RAPG          431
    # 9  B0015RB39O          424

    ## 3.2.1 get top 10 reviewers
    top_10_reviewers = df['reviewerID'].value_counts().head(
        10).reset_index().rename(columns={
            'index': 'reviewerID',
            'reviewerID': 'reviewCount'
        })
    print_header('Top 10 reviewers', char='-')
    print(top_10_reviewers)

    #        reviewerID  reviewCount
    # 0  A2NYK9KWFMJV4Y          152
    # 1  A22CW0ZHY3NJH8          138
    # 2  A1EVV74UQYVKRY          137
    # 3  A1ODOGXEYECQQ8          133
    # 4  A2NOW4U7W3F7RI          132
    # 5  A36K2N527TXXJN          124
    # 6  A1UQBFCERIP7VJ          112
    # 7   A1E1LEVQ9VQNK          109
    # 8  A18U49406IPPIJ          109
    # 9   AYB4ELCS5AM8P          107

    ## 3.2.2 Sentence segmentation
    print_header('3.2.2 Sentence Segmentation', 50)

    df['sentences'] = df['reviewText'].apply(segment_sent)
    df['sentenceCount'] = df['sentences'].apply(len)

    # plotting for number of sentences
    plot_bar(df['sentenceCount'], \
            title = 'Distribution of Number of Sentences for Each Review', \
            x_label = "Sentence Count", y_label = "Review Count", countplot = False)

    plot_bar(df['sentenceCount'].clip(0, 50), \
            title = 'Distribution of Number of Sentences for Each Review (Clipped)', \
            x_label = "Sentence Count (Clipped)", y_label = "Review Count", countplot = True)

    # get 5 random reviews to do sentence segmentation and display results
    reviews = df['reviewText']
    _seed = 43  # To give us an interesting result
    random_reviews = reviews.sample(5, random_state=_seed)
    random_reviews = pd.DataFrame(
        random_reviews,
        columns=['reviewText']).reset_index().drop(columns=['index'])
    random_reviews['segmentedSentences'] = random_reviews['reviewText'].apply(
        segment_sent)
    print(
        "5 randomly selected reviews before and after sentence segmentation:")
    print(random_reviews)

    ## 3.2.3 Tokenization and Stemming
    print_header('3.2.3 Tokenization and Stemming', 50)

    df['tokenizedSentences'] = df['sentences'].apply(
        lambda sentences: [tokenize(sentence) for sentence in sentences])
    df['tokens'] = df['tokenizedSentences'].apply(flatten)

    ### No Stemming
    print_header('No Stemming', char='-')
    df['words'] = df['tokens'].apply(
        lambda tokens: [token.lower() for token in tokens])
    df['words'] = df['words'].apply(
        lambda tokens: [token for token in tokens if is_word(token)])
    df['uniqueWords'] = df['words'].apply(set)
    df['wordCount'] = df['uniqueWords'].apply(len)

    # token = {normal_word, emoji, stopword, punctuation}
    # word = {normal_word, emoji}

    plot_bar(
        df['wordCount'],
        title=
        'Distribution of Number of Words for Each Review Without Stemming',
        x_label="Word Count",
        y_label="Review Count",
        countplot=False)
    plot_bar(
        df['wordCount'].clip(0, 300),
        title=
        'Distribution of Number of Words for Each Review Without Stemming (Clipped)',
        x_label="Word Count (Clipped)",
        y_label="Review Count",
        countplot=False)

    words = flatten(df['words'])
    words_unique = flatten(df['uniqueWords'])

    top_20_words = pd.DataFrame.from_dict(Counter(words), orient='index').\
                reset_index().rename(columns = {'index': 'Word', 0: 'Count'}).\
                sort_values(['Count'], ascending = False).head(20).\
                reset_index().drop(columns = ['index'])

    print_header('Top 20 Words Without Stemming', char='-')
    print(top_20_words)

    ### With Stemming
    print_header('With Stemming', char='-')
    stemmer = SnowballStemmer("english")
    df['stemmedWords'] = df['words'].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens])
    df['uniqueStemmedWords'] = df['stemmedWords'].apply(set)
    df['stemmedWordCount'] = df['uniqueStemmedWords'].apply(len)

    plot_bar(df['stemmedWordCount'], \
            title = 'Distribution of Number of Words for Each Review With Stemming', \
            x_label = "Stemmed Word Count", y_label = "Review Count", countplot = False)
    plot_bar(df['stemmedWordCount'].clip(0, 300), \
            title = 'Distribution of Number of Words for Each Review With Stemming (Clipped)', \
            x_label = "Word Count (Clipped)", y_label = "Review Count", countplot = False)

    plot_bar_overlap(df, ['wordCount', 'stemmedWordCount'], \
            title = 'Distribution of Number of Words for Each Review', \
            x_label = "Word Count", y_label = "Review Count", countplot = False)

    plot_bar_overlap(df[['wordCount', 'stemmedWordCount']].clip(0, 300), ['wordCount', 'stemmedWordCount'], \
            title = 'Distribution of Number of Words for Each Review (Clipped)', \
            x_label = "Word Count", y_label = "Review Count", countplot = False)

    stemmed_words = flatten(df['stemmedWords'])
    stemmed_words_unique = flatten(df['uniqueStemmedWords'])

    top_20_stemmed_words = pd.DataFrame.from_dict(Counter(stemmed_words), orient='index').\
                reset_index().rename(columns = {'index': 'Word', 0: 'Count'}).\
                sort_values(['Count'], ascending = False).head(20).\
                reset_index().drop(columns = ['index'])

    print_header('Top 20 Words with Stemming', char='-')
    print(top_20_stemmed_words)

    print_header('3.2.4 POS Tagging', 50)

    tokenized_sentences = pd.Series(flatten(df['tokenizedSentences']))
    print('Total Number of Sentences: ' + str(len(tokenized_sentences)))

    random_5_sentences = tokenized_sentences.sample(5, random_state=seed)
    random_5_df = pd.DataFrame(
        random_5_sentences,
        columns=['sentence']).reset_index().drop(columns=['index'])
    random_5_df['posTagged'] = random_5_df['sentence'].apply(pos_tag)
    print('=' * 30)
    print(random_5_df)
    print('=' * 30)

    # 3.3 Development of a Noun Phrase Summarizer
    print_header('3.3 Development of a Noun Phrase Summarizer', 50)

    df['posTagged'] = df['tokenizedSentences'].apply(
        lambda tokenizedSentences:
        [pos_tag(sentence) for sentence in tokenizedSentences])
    df['nounPhrases'] = df['posTagged'].apply(
        lambda posTagged:
        [np.lower() for np in flatten([extract_NP(tag) for tag in posTagged])])
    df[['reviewText', 'posTagged', 'nounPhrases']].head()

    # Including single noun phrases
    print_header('Including single noun phrases', char='-')
    noun_phrases = pd.DataFrame.from_dict(Counter(flatten(df['nounPhrases'])), orient='index').\
                    reset_index().rename(columns = {'index': 'Noun Phrase', 0: 'Count'}).\
                    sort_values(['Count'], ascending = False)
    top_20_noun_phrases = noun_phrases.head(20).reset_index().drop(
        columns=['index'])

    print_header('Top 20 Noun Phrases Including Single Noun Phrases', char='-')
    print(top_20_noun_phrases)

    df['nounPhrasesExcludeSingle'] = df['nounPhrases'].apply(
        lambda noun_phrases: [
            noun_phrase for noun_phrase in noun_phrases
            if len(noun_phrase.split()) > 1
        ])
    noun_phrases = pd.DataFrame.from_dict(Counter(flatten(df['nounPhrasesExcludeSingle'])), orient='index').\
                    reset_index().rename(columns = {'index': 'Noun Phrase', 0: 'Count'}).\
                    sort_values(['Count'], ascending = False)
    top_20_noun_phrases = noun_phrases.head(20).reset_index().drop(
        columns=['index'])

    print_header('Top 20 Noun Phrases Excluding Single Noun Phrases', char='-')
    print(top_20_noun_phrases)

    products = df['asin'].value_counts().head(3).index
    products_np_top1 = df[df['asin'] == products[0]]
    products_np_top2 = df[df['asin'] == products[1]]
    products_np_top3 = df[df['asin'] == products[2]]

    print_representative_np(products_np_top1, product=products[0], n=30)
    print_representative_np(products_np_top2, product=products[1], n=30)
    print_representative_np(products_np_top3, product=products[2], n=30)

    random_5_reviews = df[['reviewText', 'posTagged',
                           'nounPhrases']].sample(5, random_state=seed)
    random_5_reviews['nounPhrasesLen'] = random_5_reviews['nounPhrases'].apply(
        len)

    print_header('Noun Phrase Detector Evaluation for 5 Random Reviews',
                 char='-')
    print(random_5_reviews)

    # 3.4. Sentiment Word Detection
    print(
        str(datetime.datetime.now()).split('.')[0] +
        ': Start sentiment word detection')

    # Without Stemming and Without Negation
    sentiment_score(df, "./rep_words/ns_nn.csv")

    # With Stemming and Without Negation
    sentiment_score(df, "./rep_words/s_nn.csv", stemmer=stemmer)

    # Without Stemming and With Negation
    sentiment_score(df, "./rep_words/ns_n.csv", convert_neg=True)

    # With Stemming and With Negation
    sentiment_score(df,
                    "./rep_words/s_n.csv",
                    stemmer=stemmer,
                    convert_neg=True)