Beispiel #1
0
def get_review_keywords(filename, max_nb_reviews=99, extract_keywords=True, concat_to_extract=True, query = False):
    """Load one JSON data file and return its description, reviews and keywords.

    Args:
        filename: path of a JSON file; its 'description' and 'reviews'
            fields are read, and each review's 'body' field is used.
        max_nb_reviews: upper bound on the number of distinct review bodies
            that are kept (default 99).
        extract_keywords: when False, no AlchemyAPI call is made and the
            keyword list is empty.
        concat_to_extract: when True the kept reviews are joined into one
            string before extraction (one pair of API calls); otherwise
            every review is sent separately.
        query: accepted for interface compatibility; not used here.

    Returns:
        A 3-tuple ``(description, reviews_text, keywords)``:
        ``description`` is the file's 'description' field (None if absent,
        '' if the file does not exist), ``reviews_text`` is the kept
        reviews joined by newlines, and ``keywords`` is a list of the
        extracted keywords that are not also extracted entities (order is
        unspecified because it goes through a set).
    """
    # Every exit path returns the same 3-tuple shape so that callers can
    # always unpack ``description, reviews, keywords``.
    if not os.path.isfile(filename):
        return '', '', []
    with open(filename) as infile:
        data = json.load(infile)
    # Single-argument call form behaves the same on Python 2 and Python 3.
    print(filename + ' processing')

    # .get() yields None when the field does not exist.
    reviews_raw = data.get('reviews')
    description = data.get('description')
    if not reviews_raw:
        return description, '', []

    # Keep only the 'body' of each review, dropping duplicates while
    # preserving first-occurrence order so the selection below is
    # deterministic. Reviews without a body are skipped because None
    # cannot be joined into the output text.
    seen = set()
    bodies = []
    for raw_review in reviews_raw:
        body = raw_review.get('body')
        if body is None or body in seen:
            continue
        seen.add(body)
        bodies.append(body)
    if not bodies:
        return description, '', []

    # Limit the number of reviews that are used.
    selected = bodies[:min(max_nb_reviews, len(bodies))]
    if concat_to_extract:
        # A single newline-separated string -> a single extraction pass.
        reviews = ['\n'.join(selected)]
    else:
        reviews = selected
    if not extract_keywords:
        return description, '\n'.join(reviews), []

    keywords = []
    entities = []
    for review in reviews:
        # Entities are collected only to be subtracted from the keywords.
        response_entities = alchemyapi.entities("text", review)
        if response_entities is not None and response_entities.get('entities') is not None:
            entities.extend(i.get('text') for i in response_entities.get('entities'))
        response_keywords = alchemyapi.keywords("text", review)
        if response_keywords is not None and response_keywords.get('keywords') is not None:
            for i in response_keywords.get('keywords'):
                # process_kw is defined elsewhere; presumably it returns a
                # list of normalized keyword strings. Order is irrelevant
                # because the result is turned into a set below.
                keywords.extend(process_kw(i.get('text')))

    # Normalize the review text with process_r (defined elsewhere; per the
    # original note it lower-cases and strips punctuation except for "'").
    reviews = [process_r(review) for review in reviews]
    return description, '\n'.join(reviews), list(set(keywords) - set(entities))
Beispiel #2
0
def build_corpus(filenames, max_nb_reviews = 99, extract_keywords = True, concat_to_extract = True, query = False):
    """Build a corpus from several JSON data files.

    filenames is a list of paths to JSON data files; each one is handed to
    get_review_keywords together with the remaining arguments.

    Returns a 3-tuple (descriptions, reviews, vocabulary): the per-file
    descriptions, the per-file review text passed through process_r, and
    the de-duplicated keywords gathered over all files (arbitrary order).
    """
    descriptions, reviews, collected = [], [], []
    for path in filenames:
        description, review_text, file_keywords = get_review_keywords(
            path, max_nb_reviews, extract_keywords, concat_to_extract, query)
        reviews.append(process_r(review_text))
        descriptions.append(description)
        collected.extend(file_keywords)
    # Going through a set drops keywords shared between files.
    return descriptions, reviews, list(set(collected))