def extract_aspects(reviews):
    """
    INPUT: iterable of strings (pd Series, list)
    OUTPUT: list of aspects (whatever aspects_from_tagged_sents returns)

    Return the aspects from the set of reviews.

    Each review is UTF-8 encoded (unencodable characters are dropped),
    split into sentences, tokenized and POS-tagged; the tagged sentences
    are then passed to aspects_from_tagged_sents in one call.
    """
    # NOTE(review): extract_aspects is defined more than once in this file;
    # the definition that appears last is the one bound at import time.

    # import the aspect extraction functions
    from extract_aspects import get_sentences, tokenize, pos_tag, aspects_from_tagged_sents

    # put all the sentences in all reviews in one stream
    # NOTE(review): on Python 3 str.encode() yields bytes -- confirm that
    # get_sentences accepts bytes there.
    sentences = [
        sentence
        for review in reviews
        for sentence in get_sentences(review.encode('utf-8', 'ignore'))
    ]

    # tokenize each sentence exactly once.  The previous comprehension
    # carried a second ``for`` clause over the sentences of the *last*
    # review, which repeated every tokenized sentence that many times.
    tokenized_sentences = [tokenize(sentence) for sentence in sentences]

    # pos tag each sentence
    tagged_sentences = [pos_tag(sentence) for sentence in tokenized_sentences]

    # from the pos tagged sentences, get a list of aspects
    return aspects_from_tagged_sents(tagged_sentences)
def get_sentences_by_aspect(aspect, reviews):
    """
    INPUT: string (aspect), iterable of strings (full reviews)
    OUTPUT: list of tokenized sentences

    Given an aspect and a list of reviews, return a list of all
    sentences that mention that aspect.

    A sentence "mentions" the aspect when ``aspect in tokenize(sentence)``
    is true; the tokenized form of each matching sentence is returned, in
    the order the sentences occur in the reviews.
    """
    # NOTE(review): get_sentences_by_aspect is defined more than once in this
    # file; the definition that appears last is the one bound at import time.

    # THIS CODE IS TOTALLY COPIED FROM MAIN FILE function 'extract_aspects'
    # TODO: REFACTOR THIS IN AN INTELLIGENT WAY.
    from extract_aspects import get_sentences, tokenize

    # get all the sentences of all reviews in one stream
    # NOTE(review): on Python 3 str.encode() yields bytes -- confirm that
    # get_sentences accepts bytes there.
    sentences = []
    for review in reviews:
        sentences.extend(get_sentences(review.encode('utf-8', 'ignore')))

    # tokenize each sentence exactly once.  The previous comprehension had
    # a second ``for`` clause over the sentences of the last review, which
    # repeated every tokenized sentence that many times.
    tokenized_sentences = [tokenize(sentence) for sentence in sentences]

    return [sent for sent in tokenized_sentences if aspect in sent]
def extract_aspects(reviews):
    """
    INPUT: iterable of strings (pd Series, list)
    OUTPUT: list of aspects (whatever aspects_from_tagged_sents returns)

    Return the aspects from the set of reviews.

    Each review is split into sentences, every sentence is tokenized and
    POS-tagged, and the tagged sentences are passed to
    aspects_from_tagged_sents in one call.
    """
    # NOTE(review): extract_aspects is defined more than once in this file;
    # the definition that appears last is the one bound at import time.

    # import the aspect extraction functions
    from extract_aspects import get_sentences, tokenize, pos_tag, aspects_from_tagged_sents

    # tokenize every sentence of every review.  The outer clause must walk
    # the reviews and the inner clause their sentences; the previous form
    # iterated an undefined ``sentences`` name and read an undefined
    # ``review``, so the function failed on every call.
    tokenized_sentences = [
        tokenize(sentence)
        for review in reviews
        for sentence in get_sentences(review)
    ]

    # pos tag each sentence
    tagged_sentences = [pos_tag(sentence) for sentence in tokenized_sentences]

    # from the pos tagged sentences, get a list of aspects
    return aspects_from_tagged_sents(tagged_sentences)
def aspect_opinions(tweets, aspects):
    """
    INPUT: mapping whose values are tweet texts (it is iterated for its
           keys and each value is passed to get_sentences), aspects
           (forwarded unchanged to get_relevant_aspects)
    OUTPUT: dict mapping aspect -> [number of tokenized sentences filed
            under that aspect, score_aspect(those sentences)]

    Group the tokenized sentences of all tweets by the relevant aspects
    found in them, then count and score each group.
    """
    # NOTE(review): aspect_opinions is defined more than once in this file;
    # the definition that appears last is the one bound at import time.
    from extract_aspects import get_sentences, tokenize, pos_tag, aspects_from_tagged_sents

    # flatten every tweet into one stream of sentences, then tokenize them
    all_sentences = [
        sentence
        for key in tweets
        for sentence in get_sentences(tweets[key])
    ]
    token_lists = [tokenize(sentence) for sentence in all_sentences]

    # file each tokenized sentence under every relevant aspect it yields
    grouped = {}
    for tokens in token_lists:
        # extract_aspects is handed one tokenized sentence at a time here
        found = extract_aspects(tokens)
        # the result is used as a mapping: iterated for keys, then indexed
        matches = get_relevant_aspects(found, aspects)
        for group_key in matches:
            for aspect in matches[group_key]:
                grouped.setdefault(aspect, []).append(tokens)

    # per aspect: how many sentences were filed and what they score
    return {
        aspect: [len(members), score_aspect(members)]
        for aspect, members in grouped.items()
    }
def aspect_opinions(reviews, aspects):
    """
    INPUT: iterable of review texts, iterable of aspects
    OUTPUT: dict mapping each aspect to
            score_aspect(<all tokenized sentences>, aspect)

    Tokenize every sentence of every review once, then score each aspect
    against that full collection of tokenized sentences.
    """
    # NOTE(review): aspect_opinions is defined more than once in this file;
    # the definition that appears last is the one bound at import time.
    from extract_aspects import get_sentences, tokenize, pos_tag, aspects_from_tagged_sents

    # one flat stream of sentences across all reviews
    all_sentences = [
        sentence
        for review in reviews
        for sentence in get_sentences(review)
    ]
    token_lists = [tokenize(sentence) for sentence in all_sentences]

    # every aspect is scored against the same tokenized corpus
    return {aspect: score_aspect(token_lists, aspect) for aspect in aspects}
def extract_aspects(reviews):
    """
    INPUT: iterable of review texts
    OUTPUT: whatever all_aspects_from_tagged_sents returns for the
            POS-tagged sentences of those reviews

    Split the reviews into sentences, tokenize and POS-tag each sentence,
    and hand the tagged sentences to all_aspects_from_tagged_sents.
    """
    # NOTE(review): extract_aspects is defined more than once in this file;
    # the definition that appears last is the one bound at import time.
    from extract_aspects import get_sentences, tokenize, pos_tag, aspects_from_tagged_sents

    # one flat stream of sentences across all reviews
    all_sentences = [
        sentence
        for review in reviews
        for sentence in get_sentences(review)
    ]
    token_lists = [tokenize(sentence) for sentence in all_sentences]
    tagged = [pos_tag(tokens) for tokens in token_lists]

    # NOTE(review): all_aspects_from_tagged_sents is not one of the names
    # imported above (aspects_from_tagged_sents is); it has to be defined
    # at module level for this call to resolve -- confirm.
    return all_aspects_from_tagged_sents(tagged)
def get_sentences_by_aspect(aspect, reviews):
    """
    INPUT: string (aspect), iterable of strings (full reviews)
    OUTPUT: list of tokenized sentences

    Given an aspect and a list of reviews, return a list of all
    sentences that mention that aspect.

    A sentence "mentions" the aspect when ``aspect in tokenize(sentence)``
    is true; the tokenized form of each matching sentence is returned, in
    the order the sentences occur in the reviews.
    """
    # NOTE(review): get_sentences_by_aspect is defined more than once in this
    # file; the definition that appears last is the one bound at import time.

    # THIS CODE IS TOTALLY COPIED FROM MAIN FILE function 'extract_aspects'
    # TODO: REFACTOR THIS IN AN INTELLIGENT WAY.
    from extract_aspects import get_sentences, tokenize

    # tokenize every sentence of every review.  The outer clause must walk
    # the reviews and the inner clause their sentences; the previous form
    # iterated an undefined ``sentences`` name and read an undefined
    # ``review``, so the function failed on every call.
    tokenized_sentences = [
        tokenize(sentence)
        for review in reviews
        for sentence in get_sentences(review)
    ]

    return [sent for sent in tokenized_sentences if aspect in sent]
def aspect_opinions(reviews, aspects, relevant_features):
    """
    INPUT: iterable of review texts;
           aspects -- dict: each key is passed to score_aspect, each value
           is used as a key into relevant_features;
           relevant_features -- mapping from those values to a class label
    OUTPUT: dict mapping class label -> (count, score)

    Score every aspect against the tokenized sentences of all reviews and
    pool the results per class label.  score_aspect's result is read as
    ``result[0]`` = count and ``result[1]`` = score.  When several aspects
    share a label their counts are summed and their scores are combined
    as a mean weighted by each aspect's own count.
    """
    # NOTE(review): aspect_opinions is defined more than once in this file;
    # the definition that appears last is the one bound at import time.
    from extract_aspects import get_sentences, tokenize

    # one flat stream of sentences across all reviews, tokenized once
    sentences = []
    for review in reviews:
        sentences.extend(get_sentences(review))
    tokenized_sentences = [tokenize(sentence) for sentence in sentences]

    # score every aspect first, remembering the feature it belongs to
    scored = [
        (feature, score_aspect(tokenized_sentences, aspect))
        for aspect, feature in aspects.items()
    ]

    aspect_scores = {}
    for feature, result in scored:
        klass = relevant_features[feature]
        count, score = result[0], result[1]

        if klass not in aspect_scores:
            aspect_scores[klass] = (count, score)
            continue

        old_count, old_score = aspect_scores[klass][0], aspect_scores[klass][1]
        total_count = old_count + count
        if total_count:
            # Weight the incoming score by its OWN count.  The previous
            # formula weighted it by the merged total, which skewed the
            # pooled score towards whichever aspect was merged last.
            # float() keeps this a true division on Python 2 as well.
            pooled_score = (old_count * old_score + count * score) / float(total_count)
        else:
            # no observations on either side: nothing to average, and
            # dividing would raise ZeroDivisionError -- keep the old score
            pooled_score = old_score
        aspect_scores[klass] = (total_count, pooled_score)

    return aspect_scores