Example #1
def create_vocabulary(src_file_path, dst_file_path, repetitions_threshold=-1):
    exceptions = 0

    with open(src_file_path, 'r') as file:
        word_counter = dict()

        # Each line is a JSON document; its 'text' field is stripped of
        # non-letters, tokenized, stemmed, and counted.
        for line in file:
            line_contents = json.loads(line)

            try:
                for word in str(
                        NLP_Utils.remove_non_letters(
                            line_contents['text'])).split():
                    word = NLP_Utils.stem_word(word)
                    word_counter[word] = word_counter.get(word, 0) + 1
            except Exception as e:
                print(e)
                exceptions += 1

    # Write 'word,count' lines; a threshold of -1 keeps every word.
    with open(dst_file_path, 'w') as file:
        for k, v in word_counter.items():
            if repetitions_threshold == -1 or v > repetitions_threshold:
                file.write(k + ',' + str(v) + '\n')

    print('number of exceptions:', exceptions)
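The same counting-and-threshold idea, reduced to a self-contained sketch with trivial stand-ins for the NLP_Utils helpers (the stand-in stemmer and letter filter are assumptions, not the real implementations):

import re

def toy_stem(word):
    # stand-in for NLP_Utils.stem_word: lowercasing only
    return word.lower()

def toy_vocabulary(lines, threshold=-1):
    counts = {}
    for text in lines:
        # stand-in for NLP_Utils.remove_non_letters: keep letters and spaces
        for word in re.sub(r'[^A-Za-z ]', ' ', text).split():
            word = toy_stem(word)
            counts[word] = counts.get(word, 0) + 1
    # threshold == -1 keeps every word, mirroring create_vocabulary
    return {w: c for w, c in counts.items() if threshold == -1 or c > threshold}

print(toy_vocabulary(["Great food, great service!", "food was cold"], threshold=1))
# {'great': 2, 'food': 2}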
Example #2
def remove_stopwords(text):
    words = text.split()

    # Keep a word only when its stem is not a stopword; the original
    # (unstemmed) words are what gets re-joined.
    kept = [word for word in words
            if not NLP_Utils.is_word_stop_word(NLP_Utils.stem_word(word))]

    return ' '.join(kept)
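Stopword filtering on a toy sentence, with a plain set standing in for NLP_Utils.is_word_stop_word (a sketch under that assumption; the real helper presumably consults a stemmed stopword list):

TOY_STOPWORDS = frozenset({'the', 'was', 'a'})

def toy_remove_stopwords(text):
    # keep the original words, drop those whose lowercased form is a stopword
    return ' '.join(w for w in text.split() if w.lower() not in TOY_STOPWORDS)

print(toy_remove_stopwords("The food was great"))  # food great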
Example #3
def refresh_vocabulary():
    #mongo_url = 'mongodb://84.108.189.236:27017'
    mongo_url = 'mongodb://193.106.55.77:27017'
    data_path = "tagged_data.json"

    get_tagged_data(mongo_url, data_path)
    NLP_Utils.create_vocabulary(data_path,
                                'text_to_vector_vocabulary.txt',
                                repetitions_threshold=10)
    NLP_Utils.json_stats_counter(data_path)
Example #4
def create_words_polarity_vocabulary(positive_path, negative_path, tgt_path):
    final_list = dict()

    with open(positive_path) as pos_file:
        for line in pos_file:
            final_list[NLP_Utils.stem_word(line.rstrip('\n'))] = 1

    with open(negative_path) as neg_file:
        for line in neg_file:
            final_list[NLP_Utils.stem_word(line.rstrip('\n'))] = -1

    with open(tgt_path, 'w') as tgt_file:
        for k, v in final_list.items():
            tgt_file.write(k + ',' + str(v) + '\n')
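The merge step in isolation, with in-memory lists standing in for the two input files (stemming omitted for brevity):

positive = ['good', 'great']
negative = ['bad']

polarity = {w: 1 for w in positive}
polarity.update({w: -1 for w in negative})

# same 'word,polarity' line format the function writes to tgt_path
print('\n'.join(w + ',' + str(v) for w, v in polarity.items()))
# good,1
# great,1
# bad,-1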
Example #5
def text_2_binary_vector(text, vocabulary):
    hot_vector = [0] * len(vocabulary)
    word_counter = dict()
    words_not_in_vocabulary = 0

    for word in str(NLP_Utils.remove_non_letters(text)).split():
        word = NLP_Utils.stem_word(word)
        word_counter[word] = word_counter.get(word, 0) + 1

    for word in word_counter:
        if word in vocabulary:
            hot_vector[vocabulary[word].get_index()] = 1
        else:
            words_not_in_vocabulary += 1

    return hot_vector
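The binary bag-of-words idea with a plain word-to-index dict in place of the vocabulary objects above (whose values expose get_index(); the flat dict here is an assumption for illustration):

def toy_binary_vector(text, vocab_index):
    vector = [0] * len(vocab_index)
    for word in text.lower().split():
        if word in vocab_index:
            # presence only: repeated words still produce a 1
            vector[vocab_index[word]] = 1
    return vector

vocab_index = {'good': 0, 'bad': 1, 'food': 2}
print(toy_binary_vector("good good food", vocab_index))  # [1, 0, 1]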
Example #6
def text_2_repetitions_bi_vector(text, vocabulary):
    hot_vector = [0] * len(vocabulary)
    word_counter = dict()
    words_not_in_vocabulary = 0
    prev_word = None

    for word in str(NLP_Utils.remove_non_letters(text)).split():
        word = NLP_Utils.stem_word(word)

        # Count 'previous-current' bigrams. prev_word must be advanced on
        # every iteration, otherwise no bigram is ever recorded.
        if prev_word is not None:
            bigram = prev_word + '-' + word
            word_counter[bigram] = word_counter.get(bigram, 0) + 1
        prev_word = word

    for word, value in word_counter.items():
        if word in vocabulary:
            hot_vector[vocabulary[word].get_index()] = float(value)
        else:
            words_not_in_vocabulary += 1

    return hot_vector
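The bigram-counting pattern in isolation; note that prev_word has to be advanced on every iteration (the fix applied above) or nothing is ever counted:

def toy_bigrams(text):
    counts, prev = {}, None
    for word in text.lower().split():
        if prev is not None:
            key = prev + '-' + word
            counts[key] = counts.get(key, 0) + 1
        prev = word
    return counts

print(toy_bigrams("the cat saw the cat"))
# {'the-cat': 2, 'cat-saw': 1, 'saw-the': 1}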
Example #7
def calc_pos_rank(text):
    nouns_count = 0
    adjectives_count = 0
    verbs_count = 0
    pos_tags = NLP_Utils.text_2_part_of_speech_tag(text)

    for word_token in pos_tags:
        if word_token[1][:2] == 'NN':
            nouns_count += 1
        elif word_token[1][:2] == 'JJ':
            adjectives_count += 1
        elif word_token[1][:2] == 'VB':
            verbs_count += 1

    return adjectives_count, nouns_count, verbs_count
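NLP_Utils.text_2_part_of_speech_tag presumably wraps a Penn Treebank tagger such as NLTK's, which yields the same (word, tag) pairs; a sketch with NLTK under that assumption (requires the 'punkt' and 'averaged_perceptron_tagger' data to be downloaded):

import nltk

# NN*, JJ*, VB* are the Penn Treebank noun/adjective/verb tag families
tags = nltk.pos_tag(nltk.word_tokenize("The quick fox jumps"))
print(tags)  # e.g. [('The', 'DT'), ('quick', 'JJ'), ('fox', 'NN'), ('jumps', 'VBZ')]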
Example #8
def get_surrounding_words(text, core_words, words_before=3, words_after=7):
    reps = 0

    surrounding_words = []
    text = str(NLP_Utils.remove_non_letters(text)).split()

    for i, word in enumerate(text):
        word = NLP_Utils.stem_word(word)

        if word in core_words:
            reps += 1
            # Clamp the surrounding window to the bounds of the text.
            start_index = max(0, i - words_before)
            end_index = min(len(text), i + words_after)

            surrounding_words.extend(text[start_index:end_index])

    return " ".join(surrounding_words), reps
Example #9
def count_polarity_words(text, vocabulary):
    positive = 0
    negative = 0

    text = text.split()

    for word in text:
        word = NLP_Utils.stem_word(word)

        if word in vocabulary:
            if vocabulary[word] == 1:
                positive += 1
            else:
                negative += 1

    return positive, negative, positive - negative
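The polarity tally in miniature, using a vocabulary in the Example #4 format (word -> +1/-1) and identity stemming:

def toy_polarity(text, vocab):
    pos = sum(1 for w in text.split() if vocab.get(w) == 1)
    neg = sum(1 for w in text.split() if vocab.get(w) == -1)
    return pos, neg, pos - neg

print(toy_polarity("good good bad", {'good': 1, 'bad': -1}))  # (2, 1, 1)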
Example #10
    def __init__(self, homonyms=None, synonyms=None, stopwords=None):

        self.cal_to_J = 4.184
        self.J_to_cal = 1 / 4.184

        #food facts
        self.medianfacts = pd.read_parquet('medianfacts.parquet')
        self.allmedianfacts_subset = pd.read_parquet('allmedianfacts.parquet')

        # It would be better to create a ProductMatcher class that defines
        # these homonym/synonym dictionaries and processes them once.
        self.homonyms = {'pomme': ['pomme de terre'], 'poire': ['poireau']}
        self.synonyms = {
            'levure': ['levain'],
            'lait': ['boisson lactée'],
            'patate': ['pomme de terre']
        }
        self.stopwords = [
            'de', 'du', 'le', 'les', 'aux', 'la', 'des', 'a', 'une', 'un',
            'au', 'g', 'gr', 'gramme', 'grammes', 'kg', 'kilo', 'kilos',
            'kilogramme', 'kilogrammes', 'd', 'l'
        ]
        if homonyms is not None:
            self.homonyms = homonyms

        if synonyms is not None:
            self.synonyms = synonyms

        if stopwords is not None:
            self.stopwords = stopwords

        self.matcher = NU.SequenceMatcher(homonyms=self.homonyms,
                                          synonyms=self.synonyms,
                                          stopwords=self.stopwords)

        self.product_similarity = self.matcher.sequence_similarity

        self.verbosity = False

        self.warnings = True
Example #11

import NLP_Utils
import json

a = input('function: ')

if a == '1':
    voc = NLP_Utils.read_vocabulary('text_to_vector_vocabulary.txt')

    voc_list = [None] * len(voc)
    for k, v in voc.items():
        voc_list[v.get_index()] = k

    # interactive reverse lookup: index -> vocabulary word
    while True:
        print(voc_list[int(input("index: "))])
elif a == '2':
    src_path = 'tagged_data.json'
    target_field = 'quality_of_service_rank'
    category = '1'


    with open(src_path, 'r') as src_file:
        data_dict = {}

        filtered_count = 0
        data = list()
        target = list()
        # Runs over each line; each line should be one JSON document
        for line in src_file:
            add_line = False
            current_json = json.loads(line)
            if target_field in current_json:
Example #12
    def valeurs_nutritionnelles(self, query, quiet=None, th=0.3, pretty=False):
        """
        query: string
        Note: for now this depends on a number of global variables!
        quiet: defaults to the opposite of self.verbosity (when the argument
        is left as None)
        """

        medianfacts = self.medianfacts
        allmedianfacts_subset = self.allmedianfacts_subset
        stopwords = self.stopwords
        product_similarity = self.product_similarity

        queryst = NU.preprocess_string(query, stopwords)
        queryproduct = medianfacts[medianfacts.index == queryst]

        if quiet is None:
            quiet = not self.verbosity

        if len(queryproduct) == 0:

            #rare product ?
            queryproduct = allmedianfacts_subset[allmedianfacts_subset.index ==
                                                 queryst]

            if len(queryproduct) > 0:
                if not quiet:
                    print(query, "is an unusual product in the dataset")
            # else:
            if 0:  # disabled branch

                #products starting with the query ?
                queryproduct = allmedianfacts_subset[
                    allmedianfacts_subset.index.str.startswith(queryst)]
                if len(queryproduct) > 0:
                    if not quiet:
                        print("no exact matching of", query,
                              "in the dataset. Retrieved names starting by ",
                              query)
                    qproducts = allmedianfacts_subset[
                        allmedianfacts_subset.index.str.startswith(queryst)]
                    names = qproducts.product_name.to_list()
                    if len(names) > 0:
                        if not quiet: print(names)

        if len(queryproduct) == 0:
            if not quiet:
                print(
                    query,
                    "no match found in the dataset, getting most similar product name"
                )

            sims = medianfacts.product_name.apply(
                lambda x: product_similarity(x, query)).rename('sim')
            max_sim_idx = sims.idxmax()

            if sims[max_sim_idx] > th:
                queryproduct = medianfacts.loc[[max_sim_idx]]

            if len(queryproduct) == 0:
                if not quiet:
                    print(
                        query,
                        "no match found in the usual dataset, getting most similar product name"
                    )

                sims = allmedianfacts_subset.product_name.apply(
                    lambda x: product_similarity(x, query)).rename('sim')
                max_sim_idx = sims.idxmax()

                if sims[max_sim_idx] > th:
                    queryproduct = allmedianfacts_subset.loc[[max_sim_idx]]

        if len(queryproduct) > 0:
            if not quiet:
                print(query)
                print('(one of the numerous) product name:',
                      queryproduct.product_name.iloc[0])
                print('kCal/100g:',
                      queryproduct.energy_100g.iloc[0] * self.J_to_cal)
                print('fiber/100g:', queryproduct.fiber_100g.iloc[0])
                print('fat/100g:', queryproduct.fat_100g.iloc[0])
                print('saturated-fat/100g:',
                      queryproduct['saturated-fat_100g'].iloc[0])
                print('carbohydrates/100g:',
                      queryproduct.carbohydrates_100g.iloc[0])
                print('sugar/100g:', queryproduct.sugars_100g.iloc[0])
                print('protein/100g:', queryproduct.proteins_100g.iloc[0])
                print('salt/100g:', queryproduct.salt_100g.iloc[0])
                print('sodium/100g:', queryproduct.sodium_100g.iloc[0])
                print('additives:', queryproduct.additives_n.iloc[0])
                print('ingredients from palm oil:',
                      queryproduct.ingredients_from_palm_oil_n.iloc[0])
                print('ingredients maybe from palm oil:',
                      queryproduct.ingredients_that_may_be_from_palm_oil_n.iloc[0])

                print('nutrition score:',
                      queryproduct['nutrition-score-fr_100g'].iloc[0])
                print('nutrition grade:',
                      queryproduct['nutriscore_grade'].iloc[0])
                print('nova score:', queryproduct['nova_group'].iloc[0])
                if 'unit_weight_estimate' in queryproduct.columns:
                    print('unit weight estimate (median):',
                          queryproduct['unit_weight_estimate'].iloc[0])
                if 'unit_weight_estimate2' in queryproduct.columns:
                    print('unit weight estimate 2 (most probable value):',
                          queryproduct['unit_weight_estimate2'].iloc[0])

        if not quiet: print('\n')

        if pretty:
            queryproduct.insert(1, 'kCal/100g',
                                queryproduct.energy_100g * self.J_to_cal, True)
            if 'unit_weight_estimate' in queryproduct.columns:
                queryproduct = queryproduct[[
                    'product_name', 'kCal/100g', 'fiber_100g', 'fat_100g',
                    'saturated-fat_100g', 'sugars_100g', 'proteins_100g',
                    'salt_100g', 'sodium_100g', 'additives_n',
                    'ingredients_from_palm_oil_n',
                    'ingredients_that_may_be_from_palm_oil_n',
                    'nutrition-score-fr_100g', 'nova_group',
                    'unit_weight_estimate'
                ]]
                queryproduct.columns = [
                    'Nom', 'kCal/100g', 'Fibres/100g', 'Graisses/100g',
                    'Graisses saturées/100g', 'Sucre/100g', 'Protéines/100g',
                    'Sel/100g', 'Sodium/100g', "Additifs",
                    "Ingrédients dérivés de l'huile de palme",
                    "Ingrédients potentiellement dérivés de l'huile de palme",
                    "Nutriscore", "Nova-group", 'Weight (estimated)'
                ]
            else:
                queryproduct = queryproduct[[
                    'product_name', 'kCal/100g', 'fiber_100g', 'fat_100g',
                    'saturated-fat_100g', 'sugars_100g', 'proteins_100g',
                    'salt_100g', 'sodium_100g', 'additives_n',
                    'ingredients_from_palm_oil_n',
                    'ingredients_that_may_be_from_palm_oil_n',
                    'nutrition-score-fr_100g', 'nova_group'
                ]]
                queryproduct.columns = [
                    'Nom', 'kCal/100g', 'Fibres/100g', 'Graisses/100g',
                    'Graisses saturées/100g', 'Sucre/100g', 'Protéines/100g',
                    'Sel/100g', 'Sodium/100g', "Additifs",
                    "Ingrédients dérivés de l'huile de palme",
                    "Ingrédients potentiellement dérivés de l'huile de palme",
                    "Nutriscore", "Nova-group"
                ]
            queryproduct = queryproduct.transpose()
            queryproduct.columns.name = "Produit"
        return queryproduct
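The lookup-then-similarity-fallback pattern at the heart of the method, reduced to toy data and a toy Jaccard similarity (everything here, including reusing the 0.3 threshold, is illustrative rather than the class's real data or matcher):

import pandas as pd

facts = pd.DataFrame({'product_name': ['pomme', 'poire']},
                     index=['pomme', 'poire'])
query = 'pomme verte'

hit = facts[facts.index == query]  # exact match on the preprocessed name
if len(hit) == 0:
    # fall back to the most similar product name above a threshold
    sims = facts.product_name.apply(
        lambda name: len(set(name) & set(query)) / len(set(name) | set(query))
    ).rename('sim')
    best = sims.idxmax()
    if sims[best] > 0.3:
        hit = facts.loc[[best]]

print(hit)  # the 'pomme' row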
Example #13
    def units_to_grams(self, recette, nb_pers=None, no_delay=False):
        """
        Convert a recipe's quantities to grams per person, estimating a
        standard unit weight for ingredients given as a number of units.
        """

        if nb_pers is None:
            if self.warnings:
                # assumes 'import warnings' at module level
                warnings.warn(
                    "You did not specify how many people the recipe serves; "
                    "a default of 1 person will be used. To silence this "
                    "warning, set this class's 'warnings' attribute to "
                    "False, or pass a number of people.")
            nb_pers = 1

        if isinstance(recette, dict):
            recettedf = pd.DataFrame(recette)
        elif isinstance(recette, pd.DataFrame):
            recettedf = recette.copy()
        else:
            raise TypeError(
                "'recette' must be either a dict or a pandas DataFrame")

        # For ingredients given by the unit, estimate the standard unit weight
        uindexes = recettedf.unit == 'u'
        queryingr = list(recettedf.ingredients[uindexes])
        recettedf.index = recettedf.ingredients

        if len(queryingr) > 0:

            weights = []

            for q in queryingr:

                w = np.nan
                # extract the unit weight from the dataset (its index holds
                # the preprocessed product names used by .loc)
                if q in self.medianfacts.index:
                    w = self.medianfacts.loc[q].unit_weight_estimate

                #query unit weight if not in dataset
                if np.isnan(w):

                    out = NU.query_food_weight(q, no_delay=no_delay)

                    if len(out) == 0:
                        print(
                            "units_to_grams: no weight found, conversion cannot be performed."
                        )
                        return recettedf

                    # keep the most probable value
                    w = out[-4]

                weights.append(w)

            nbs = list(recettedf.loc[queryingr, 'qty'])

            try:
                nbs = [float(n) for n in nbs]
            except (TypeError, ValueError):
                raise ValueError("Please enter valid quantities")

            weights = [w * n for w, n in zip(weights, nbs)]

            recettedf.loc[queryingr, 'qty'] = weights

        recettedf.unit = 'g'

        recettedf.qty = recettedf.qty / nb_pers

        return recettedf
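The unit-to-grams step in isolation, with a hard-coded unit weight in place of the dataset and web lookups (toy values throughout):

import pandas as pd

recette = pd.DataFrame({'ingredients': ['pomme', 'farine'],
                        'qty': [3.0, 250.0],
                        'unit': ['u', 'g']})
recette.index = recette.ingredients
unit_weight = {'pomme': 150.0}  # assumed median unit weight in grams

mask = recette.unit == 'u'
recette.loc[mask, 'qty'] = [unit_weight[i] * q
                            for i, q in zip(recette.index[mask],
                                            recette.loc[mask, 'qty'])]
recette.unit = 'g'

print(recette.qty.tolist())  # [450.0, 250.0] for the whole recipe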