import json  # create_vocabulary below parses one JSON document per line
# NLP_Utils helpers (stem_word, remove_non_letters, ...) are assumed to be in scope.


def create_vocabulary(src_file_path, dst_file_path, repetitions_threshold=-1):
    exceptions = 0
    with open(src_file_path, 'r') as file:
        word_counter = dict()
        counter = 1  # line counter
        # Each line is expected to be a standalone JSON document with a 'text' field.
        for line in file:
            line_contents = json.loads(line)
            try:
                for word in str(
                        NLP_Utils.remove_non_letters(
                            line_contents['text'])).split():
                    word = NLP_Utils.stem_word(word)
                    word_counter[word] = word_counter.get(word, 0) + 1
            except Exception as e:
                print(e)
                exceptions += 1
            counter += 1
    with open(dst_file_path, 'w') as file:
        for k, v in word_counter.items():
            # A threshold of -1 keeps every word.
            if repetitions_threshold == -1 or v > repetitions_threshold:
                file.write(k + ',' + str(v) + '\n')
    print('number of exceptions:', exceptions)
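
# Usage sketch (hypothetical paths; assumes each source line is a JSON document
# with a 'text' field, as create_vocabulary expects). Each output line is
# '<stemmed word>,<count>', keeping only words seen more than
# repetitions_threshold times:
#   create_vocabulary('tagged_data.json', 'vocab.txt', repetitions_threshold=10)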
def remove_stopwords(text):
    # Keep only the words whose stem is not a stopword.
    words = text.split()
    kept = [word for word in words
            if not NLP_Utils.is_word_stop_word(NLP_Utils.stem_word(word))]
    return ' '.join(kept)
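
# Example (the exact output depends on NLP_Utils.stem_word and its stopword
# list): remove_stopwords('this is a fine place') would drop words such as
# 'this', 'is', 'a' and return something like 'fine place'.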
def refresh_vocabulary():
    # mongo_url = 'mongodb://84.108.189.236:27017'
    mongo_url = 'mongodb://193.106.55.77:27017'
    data_path = "tagged_data.json"
    get_tagged_data(mongo_url, data_path)
    NLP_Utils.create_vocabulary(data_path, 'text_to_vector_vocabulary.txt',
                                repetitions_threshold=10)
    NLP_Utils.json_stats_counter(data_path)
def create_words_polarity_vocabulary(positive_path, negative_path, tgt_path):
    # Maps each stem to +1 (positive) or -1 (negative). A stem present in
    # both input files ends up as -1, since the negative file is read last.
    final_list = dict()
    with open(positive_path) as pos_file:
        for line in pos_file:
            final_list[NLP_Utils.stem_word(line.rstrip('\n'))] = 1
    with open(negative_path) as neg_file:
        for line in neg_file:
            final_list[NLP_Utils.stem_word(line.rstrip('\n'))] = -1
    with open(tgt_path, 'w') as tgt_file:
        for k, v in final_list.items():
            tgt_file.write(k + ',' + str(v) + '\n')
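
# Usage sketch (hypothetical file names; the input files are assumed to hold
# one word per line). The output is a '<stem>,<polarity>' CSV:
#   create_words_polarity_vocabulary('positive_words.txt',
#                                    'negative_words.txt',
#                                    'words_polarity_vocabulary.txt')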
def text_2_binary_vector(text, vocabulary):
    hot_vector = [0] * len(vocabulary)
    word_counter = dict()
    words_not_in_vocabulary = 0
    for word in str(NLP_Utils.remove_non_letters(text)).split():
        word = NLP_Utils.stem_word(word)
        word_counter[word] = word_counter.get(word, 0) + 1
    for word in word_counter:
        if word in vocabulary:
            # Presence only: the count is not used for the binary vector.
            hot_vector[vocabulary[word].get_index()] = 1
        else:
            words_not_in_vocabulary += 1
    return hot_vector
def text_2_repetitions_bi_vector(text, vocabulary):
    hot_vector = [0] * len(vocabulary)
    word_counter = dict()
    words_not_in_vocabulary = 0
    prev_word = None
    for word in str(NLP_Utils.remove_non_letters(text)).split():
        word = NLP_Utils.stem_word(word)
        if prev_word is not None:
            bigram = prev_word + '-' + word
            word_counter[bigram] = word_counter.get(bigram, 0) + 1
        prev_word = word  # advance the bigram window
    for bigram, count in word_counter.items():
        if bigram in vocabulary:
            hot_vector[vocabulary[bigram].get_index()] = float(count)
        else:
            words_not_in_vocabulary += 1
    return hot_vector
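
# Usage sketch for both vectorizers above (assumes 'vocabulary' maps a stemmed
# word, or a 'word1-word2' stemmed bigram, to an entry exposing get_index(),
# as loaded by NLP_Utils.read_vocabulary; 'bigram_voc' is hypothetical):
#   voc = NLP_Utils.read_vocabulary('text_to_vector_vocabulary.txt')
#   binary_vec = text_2_binary_vector('the food was great', voc)
#   bigram_vec = text_2_repetitions_bi_vector('the food was great', bigram_voc)
# text_2_binary_vector marks presence (0/1), while
# text_2_repetitions_bi_vector stores the repetition count of each bigram.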
def calc_pos_rank(text):
    nouns_count = 0
    adjectives_count = 0
    verbs_count = 0
    pos_tags = NLP_Utils.text_2_part_of_speech_tag(text)
    for word_token in pos_tags:
        if word_token[1][:2] == 'NN':
            nouns_count += 1
        elif word_token[1][:2] == 'JJ':
            adjectives_count += 1
        elif word_token[1][:2] == 'VB':
            verbs_count += 1
    return adjectives_count, nouns_count, verbs_count
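
# Note the return order: (adjectives, nouns, verbs), which differs from the
# declaration order above. Usage sketch (the 'NN'/'JJ'/'VB' prefixes suggest
# Penn Treebank tags, which is an assumption here):
#   adjectives, nouns, verbs = calc_pos_rank('the quick fox jumps')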
def get_surrounding_words(text, core_words, words_before=3, words_after=7):
    reps = 0
    surrounding_words = []
    text = str(NLP_Utils.remove_non_letters(text)).split()
    for i, word in enumerate(text):
        word = NLP_Utils.stem_word(word)
        if word in core_words:
            reps += 1
            start_index = max(i - words_before, 0)
            end_index = min(i + words_after, len(text))
            surrounding_words.extend(text[start_index:end_index])
    return " ".join(surrounding_words), reps
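
# Usage sketch (hypothetical core words). Matching is done on stems while the
# raw tokens are returned, and the slice end is exclusive, so at most
# words_after - 1 words after each match are kept:
#   context, hits = get_surrounding_words(review_text, {'servic', 'staff'})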
def count_polarity_words(text, vocabulary):
    positive = 0
    negative = 0
    for word in text.split():
        word = NLP_Utils.stem_word(word)
        if word in vocabulary:
            if vocabulary[word] == 1:
                positive += 1
            else:
                negative += 1
    return positive, negative, positive - negative
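
# Usage sketch (assumes 'polarity_voc' maps stems to +1/-1, as produced by
# create_words_polarity_vocabulary and loaded back into a dict):
#   positive, negative, polarity = count_polarity_words(text, polarity_voc)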
def __init__(self, homonyms=None, synonyms=None, stopwords=None):
    self.cal_to_J = 4.184
    self.J_to_cal = 1 / 4.184
    # Food facts
    self.medianfacts = pd.read_parquet('medianfacts.parquet')
    self.allmedianfacts_subset = pd.read_parquet('allmedianfacts.parquet')
    # It would be better to create a ProductMatcher class that defines the
    # homonym/synonym dictionaries and processes them once.
    self.homonyms = {'pomme': ['pomme de terre'], 'poire': ['poireau']}
    self.synonyms = {
        'levure': ['levain'],
        'lait': ['boisson lactée'],
        'patate': ['pomme de terre']
    }
    self.stopwords = [
        'de', 'du', 'le', 'les', 'aux', 'la', 'des', 'a', 'une', 'un', 'au',
        'g', 'gr', 'gramme', 'grammes', 'kg', 'kilo', 'kilos',
        'kilogramme', 'kilogrammes', 'd', 'l'
    ]
    if homonyms is not None:
        self.homonyms = homonyms
    if synonyms is not None:
        self.synonyms = synonyms
    if stopwords is not None:
        self.stopwords = stopwords
    self.matcher = NU.SequenceMatcher(homonyms=self.homonyms,
                                      synonyms=self.synonyms,
                                      stopwords=self.stopwords)
    self.product_similarity = self.matcher.sequence_similarity
    self.verbosity = False
    self.warnings = True
import NLP_Utils
import json

a = input('function:')
if str(1) == a:
    voc = NLP_Utils.read_vocabulary('text_to_vector_vocabulary.txt')
    voc_list = [None] * len(voc)
    for k, v in voc.items():
        voc_list[v.get_index()] = k
    while True:
        print(voc_list[int(input("index: "))])
elif str(2) == a:
    src_path = 'tagged_data.json'
    target_field = 'quality_of_service_rank'
    category = '1'
    with open(src_path, 'r') as src_file:
        data_dict = {}
        filtered_count = 0
        data = list()
        target = list()
        # Runs on each line, each of which is supposed to be a JSON doc.
        for line in src_file:
            add_line = False
            current_json = json.loads(line)
            if target_field in current_json:
def valeurs_nutritionnelles(self, query, quiet=None, th=0.3, pretty=False):
    """
    query: string
    Note: this still depends on many global variables for now!
    quiet: defaults to the opposite of self.verbosity (if the argument is left as None)
    """
    cal_to_J = 4.184
    J_to_cal = 1 / cal_to_J
    medianfacts = self.medianfacts
    allmedianfacts_subset = self.allmedianfacts_subset
    stopwords = self.stopwords
    product_similarity = self.product_similarity
    queryst = NU.preprocess_string(query, stopwords)
    queryproduct = medianfacts[medianfacts.index == queryst]
    if quiet is None:
        quiet = not self.verbosity
    if len(queryproduct) == 0:
        # Rare product?
        queryproduct = allmedianfacts_subset[allmedianfacts_subset.index ==
                                             queryst]
        if len(queryproduct) > 0:
            if not quiet:
                print(query, "is an unusual product in the dataset")
    # else:
    if 0:  # Disabled: products starting with the query?
        queryproduct = allmedianfacts_subset[
            allmedianfacts_subset.index.str.startswith(queryst)]
        if len(queryproduct) > 0:
            if not quiet:
                print("no exact matching of", query,
                      "in the dataset. Retrieved names starting by ", query)
            qproducts = allmedianfacts_subset[
                allmedianfacts_subset.index.str.startswith(queryst)]
            names = qproducts.product_name.to_list()
            if len(names) > 0:
                if not quiet:
                    print(names)
    if len(queryproduct) == 0:
        if not quiet:
            print(query,
                  "no match found in the dataset, getting most similar product name")
        #sims = medianfacts.product_name.apply(lambda x: similarity(x,query)).rename('sim')
        sims = medianfacts.product_name.apply(
            lambda x: product_similarity(x, query)).rename('sim')
        max_sim_idx = sims.idxmax()
        #print('max_sim:', max_sim_idx, sims[max_sim_idx])
        if sims[max_sim_idx] > th:
            queryproduct = medianfacts.loc[[max_sim_idx]]
    if len(queryproduct) == 0:
        if not quiet:
            print(query,
                  "no match found in the usual dataset, getting most similar product name")
        #sims = allmedianfacts_subset.product_name.apply(lambda x: similarity(x,query)).rename('sim')
        sims = allmedianfacts_subset.product_name.apply(
            lambda x: product_similarity(x, query)).rename('sim')
        max_sim_idx = sims.idxmax()
        #print('max_sim:', max_sim_idx, sims[max_sim_idx])
        if sims[max_sim_idx] > th:
            queryproduct = allmedianfacts_subset.loc[[max_sim_idx]]
    if len(queryproduct) > 0:
        if not quiet:
            print(query)
            print('(one of the numerous) product name:',
                  queryproduct.product_name[0])
            print('kCal/100g:', queryproduct.energy_100g[0] * self.J_to_cal)
            print('fiber/100g:', queryproduct.fiber_100g[0])
            print('fat/100g:', queryproduct.fat_100g[0])
            print('saturated-fat/100g:', queryproduct['saturated-fat_100g'][0])
            print('carbohydrates/100g:', queryproduct.carbohydrates_100g[0])
            print('sugar/100g:', queryproduct.sugars_100g[0])
            print('protein/100g:', queryproduct.proteins_100g[0])
            print('salt/100g:', queryproduct.salt_100g[0])
            print('sodium/100g:', queryproduct.sodium_100g[0])
            print('additives:', queryproduct.additives_n[0])
            print('ingredients from palm oil:',
                  queryproduct.ingredients_from_palm_oil_n[0])
            print('ingredients maybe from palm oil:',
                  queryproduct.ingredients_that_may_be_from_palm_oil_n[0])
            print('nutrition score:',
                  queryproduct['nutrition-score-fr_100g'][0])
            print('nutrition grade:', queryproduct['nutriscore_grade'][0])
            print('nova score:', queryproduct['nova_group'][0])
            if 'unit_weight_estimate' in queryproduct.columns:
                print('unit weight estimate (median):',
                      queryproduct['unit_weight_estimate'][0])
            if 'unit_weight_estimate2' in queryproduct.columns:
                print('unit weight estimate 2 (most probable value):',
                      queryproduct['unit_weight_estimate2'][0])
    if not quiet:
        print('\n')
    if pretty:
        queryproduct.insert(1, 'kCal/100g',
                            queryproduct.energy_100g * self.J_to_cal, True)
        if 'unit_weight_estimate' in queryproduct.columns:
            queryproduct = queryproduct[[
                'product_name', 'kCal/100g', 'fiber_100g', 'fat_100g',
                'saturated-fat_100g', 'sugars_100g', 'proteins_100g',
                'salt_100g', 'sodium_100g', 'additives_n',
                'ingredients_from_palm_oil_n',
                'ingredients_that_may_be_from_palm_oil_n',
                'nutrition-score-fr_100g', 'nova_group', 'unit_weight_estimate'
            ]]
            queryproduct.columns = [
                'Nom', 'kCal/100g', 'Fibres/100g', 'Graisses/100g',
                'Graisses saturées/100g', 'Sucre/100g', 'Protéines/100g',
                'Sel/100g', 'Sodium/100g', "Additifs",
                "Ingrédients dérivés de l'huile de palme",
                "Ingrédients potentiellement dérivés de l'huile de palme",
                "Nutriscore", "Nova-group", 'Weight (estimated)'
            ]
        else:
            queryproduct = queryproduct[[
                'product_name', 'kCal/100g', 'fiber_100g', 'fat_100g',
                'saturated-fat_100g', 'sugars_100g', 'proteins_100g',
                'salt_100g', 'sodium_100g', 'additives_n',
                'ingredients_from_palm_oil_n',
                'ingredients_that_may_be_from_palm_oil_n',
                'nutrition-score-fr_100g', 'nova_group'
            ]]
            queryproduct.columns = [
                'Nom', 'kCal/100g', 'Fibres/100g', 'Graisses/100g',
                'Graisses saturées/100g', 'Sucre/100g', 'Protéines/100g',
                'Sel/100g', 'Sodium/100g', "Additifs",
                "Ingrédients dérivés de l'huile de palme",
                "Ingrédients potentiellement dérivés de l'huile de palme",
                "Nutriscore", "Nova-group"
            ]
        queryproduct = queryproduct.transpose()
        queryproduct.columns.name = "Produit"
    return queryproduct
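
# Usage sketch (the enclosing class is not shown in this excerpt, so the
# instance below is hypothetical):
#   nutri = NutritionHelper()  # assumed constructor, see __init__ above
#   table = nutri.valeurs_nutritionnelles('pomme de terre', pretty=True)
# With pretty=True the method returns a transposed, French-labeled DataFrame;
# otherwise it returns the matching row(s) unchanged.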
def units_to_grams(self, recette, nb_pers=None, no_delay=False):
    """Convert unit-based ingredient quantities to grams per person."""
    if nb_pers is None:
        # Assumes the standard-library 'warnings' module is imported at
        # module level; the original constructed a Warning without raising it.
        warnings.warn(
            "You did not specify how many people the recipe serves. "
            "A default of 1 person will be used. To avoid this warning, set "
            "this class's 'warnings' attribute to False, or specify a "
            "number of people.")
        nb_pers = 1
    if type(recette) is dict:
        recettedf = pd.DataFrame(recette)
    elif type(recette) is pd.core.frame.DataFrame:
        recettedf = recette.copy()
    else:
        raise Exception(
            "'recette' must be either a dict or a pandas.core.frame.DataFrame")
    # For ingredients given by the unit, estimate the standard unit weight.
    uindexes = recettedf.unit == 'u'
    queryingr = list(recettedf.ingredients[uindexes])
    recettedf.index = recettedf.ingredients
    if len(queryingr) > 0:
        weights = []
        for q in queryingr:
            w = np.nan
            # Extract the unit weight from the dataset.
            if q in self.medianfacts.product_name:
                w = self.medianfacts.loc[q].unit_weight_estimate
            # Query the unit weight if it is not in the dataset.
            if np.isnan(w):
                out = NU.query_food_weight(q, no_delay=no_delay)
                if len(out) == 0:
                    print("units_to_grams: no weight found, "
                          "conversion cannot be performed.")
                    return recettedf
                # Keep the most probable value.
                w = out[-4]
            weights.append(w)
        nbs = list(recettedf.loc[queryingr, 'qty'])
        try:
            nbs = [float(n) for n in nbs]
        except (TypeError, ValueError):
            raise Exception("Please enter valid quantities")
        weights = [w * n for w, n in zip(weights, nbs)]
        recettedf.loc[queryingr, 'qty'] = weights
    recettedf.unit = 'g'
    recettedf.qty = recettedf.qty / nb_pers
    return recettedf
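
# Usage sketch (hypothetical recipe and instance name; 'u' marks quantities
# given per unit, anything else is assumed to already be in grams):
#   recette = {'ingredients': ['pomme de terre', 'lait'],
#              'qty': [3, 200],
#              'unit': ['u', 'g']}
#   perperson = nutri.units_to_grams(recette, nb_pers=4)
# Unit-based rows are converted with the estimated unit weight, then every
# quantity is divided by nb_pers.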