def get_review_keywords(filename, max_nb_reviews=99, extract_keywords=True, concat_to_extract=True, query=False):
    """Load one JSON data file and return its description, review text and keywords.

    Parameters:
        filename: path of a JSON file. Its 'reviews' entry is read as a list
            of dicts carrying the review text under 'body'; its 'description'
            entry is passed through untouched.
        max_nb_reviews: upper bound on the number of distinct reviews kept
            (default 99).
        extract_keywords: when False, AlchemyAPI is not called; the selected
            review text is returned as-is with an empty keyword list.
        concat_to_extract: when True the kept reviews are joined with newlines
            and analysed in a single API call; otherwise every review is
            analysed separately.
        query: not used by this function; kept so existing callers keep working.

    Returns:
        A 3-tuple ``(description, reviews_text, keywords)`` on every path:
        * ``description`` is the file's 'description' value (None when the
          field or the file is missing),
        * ``reviews_text`` is the kept reviews joined with newlines ('' when
          there is nothing to keep),
        * ``keywords`` is a list of distinct keywords that are not also
          reported as entities ([] when nothing was extracted).
    """
    # A missing file yields the same 3-tuple shape as every other path, so
    # callers can always unpack three values.
    if not os.path.isfile(filename):
        return None, '', []

    with open(filename) as infile:
        data = json.load(infile)
    # Single-argument parenthesised form prints identically on Python 2 and 3.
    print(filename + ' processing')

    # .get() returns None when a field does not exist.
    reviews_raw = data.get('reviews')
    description = data.get('description')
    if not reviews_raw:
        return description, '', []

    # Only the 'body' field is of interest. Drop missing/empty bodies (None
    # would break the join below) and remove duplicates while keeping the
    # file order, so the reviews kept under max_nb_reviews are deterministic.
    seen = set()
    bodies = []
    for entry in reviews_raw:
        body = entry.get('body')
        if not body or body in seen:
            continue
        seen.add(body)
        bodies.append(body)
    if not bodies:
        return description, '', []

    # Slicing already caps at the list length, no explicit min() needed.
    selected = bodies[:max_nb_reviews]
    if concat_to_extract:
        reviews = ['\n'.join(selected)]
    else:
        reviews = selected

    if not extract_keywords:
        return description, '\n'.join(reviews), []

    # Sets: only distinct values matter because the result is a set difference.
    keywords = set()
    entities = set()
    for review in reviews:
        response_entities = alchemyapi.entities("text", review)
        if response_entities is not None:
            entities.update(
                item.get('text') for item in response_entities.get('entities') or []
            )

        response_keywords = alchemyapi.keywords("text", review)
        if response_keywords is not None:
            for item in response_keywords.get('keywords') or []:
                # process_kw (defined elsewhere) turns one keyword text into a
                # list of keywords.
                keywords.update(process_kw(item.get('text')))

    # process_r (defined elsewhere) normalises the review text; per the
    # original note it removes upper-case letters and all punctuation
    # except the apostrophe.
    reviews = [process_r(review) for review in reviews]

    # NOTE(review): entities are compared as raw API text while keywords go
    # through process_kw, so only keywords whose processed form equals an
    # entity's raw text are removed -- confirm this is intended.
    return description, '\n'.join(reviews), list(keywords - entities)
def build_corpus(filenames, max_nb_reviews=99, extract_keywords=True, concat_to_extract=True, query=False):
    """Build a corpus and a keyword vocabulary from several JSON data files.

    Parameters:
        filenames: list of paths to JSON data files.
        max_nb_reviews, extract_keywords, concat_to_extract, query: forwarded
            unchanged to get_review_keywords for every file.

    Returns:
        A 3-tuple ``(descriptions, reviews, vocabulary)``. ``descriptions`` and
        ``reviews`` hold one entry per input file, in input order (the review
        text is passed through process_r); ``vocabulary`` is the list of
        distinct keywords collected over all files, in no particular order.
    """
    vocabulary = set()
    reviews = []
    descriptions = []
    for filename in filenames:
        result = get_review_keywords(filename, max_nb_reviews, extract_keywords,
                                     concat_to_extract, query)
        # Take the first, second and last items instead of a strict 3-way
        # unpack: a longer result tuple from get_review_keywords must not
        # abort the whole corpus build with ValueError.
        description, review_text, keywords = result[0], result[1], result[-1]
        reviews.append(process_r(review_text))
        descriptions.append(description)
        vocabulary.update(keywords)
    return descriptions, reviews, list(vocabulary)