def test_get_sentences_fail():
    """
    Feed None to the tokenizer.

    The body makes no assertion of its own: it only issues the call, and
    any exception raised by it propagates to the test runner.

    NOTE(review): the test name (and the previous docstring) refer to
    get_sentences, yet the body calls tokenize -- confirm which function
    was meant to be exercised here.
    """
    invalid_input = None
    tokenize(invalid_input)
def extract_aspects(reviews):
    """
    INPUT: reviews -- handed unchanged to get_sentences
           (historically documented as an iterable of strings: pd Series, list)
    OUTPUT: the value produced by aspects_from_tagged_sents
            (historically documented as a list of aspects)

    Run the aspect-extraction pipeline over the reviews:
    split into sentences -> tokenize -> POS tag -> extract aspects.
    """
    # the pipeline stages are imported at call time, not at module import
    from opinion_mining.extract_aspects import (
        get_sentences,
        tokenize,
        pos_tag,
        aspects_from_tagged_sents,
    )

    # NOTE(review): `reviews` is passed to get_sentences in one piece rather
    # than review-by-review -- confirm get_sentences accepts that input.
    # The sentences are fully collected before any tokenizing starts.
    all_sentences = list(get_sentences(reviews))

    # stage 1: tokenize every sentence
    token_lists = list(map(tokenize, all_sentences))

    # stage 2: POS tag every tokenized sentence
    tagged = list(map(pos_tag, token_lists))

    # stage 3: derive the aspects from the tagged sentences
    return aspects_from_tagged_sents(tagged)
def test_tokenize():
    """
    Check the tokenizer on a short mixed-case sentence: every token must be
    lowercase, there must be exactly five tokens, and the result must be a
    list.
    """
    tokens = tokenize("This is a test SENTENCE")

    # every token has to come back lowercased
    assert all(tok.islower() for tok in tokens)
    # five words in, five tokens out
    assert len(tokens) == 5
    # the tokenizer must hand back a concrete list
    assert isinstance(tokens, list)
def get_sentences_by_aspect(aspect, reviews):
    """
    INPUT: aspect  -- a single aspect term (string) or an iterable of terms
           reviews -- handed unchanged to get_sentences
    OUTPUT: list of tokenized sentences (each element is whatever tokenize
            returns for one sentence)

    Return every tokenized sentence that contains at least one of the
    aspect terms as a token. Each distinct tokenized sentence appears once,
    ordered by the first aspect term that matched it and then by its
    position in the sentence stream.
    """
    # TODO: the sentence-splitting/tokenizing steps duplicate
    # 'extract_aspects'; factor them into a shared helper.
    from opinion_mining.extract_aspects import get_sentences, tokenize

    # A bare string is ONE aspect term. Iterating it directly would compare
    # its individual characters against whole tokens.
    terms = [aspect] if isinstance(aspect, str) else aspect

    # NOTE(review): `reviews` is passed to get_sentences in one piece rather
    # than review-by-review -- confirm get_sentences accepts that input.
    tokenized_sentences = [
        tokenize(sentence) for sentence in get_sentences(reviews)
    ]

    matches = []
    for term in terms:
        for tokens in tokenized_sentences:
            # De-duplicate on the sentence itself. The previous guard tested
            # the aspect term against the result list, which never matched,
            # so a sentence hit by several terms was appended repeatedly.
            if term in tokens and tokens not in matches:
                matches.append(tokens)
    return matches