def crawl_studies_facts(survey_field, facts_d):
    """Bulk-write one ``StudiesMap`` update action per blindcode in ``facts_d``.

    ``facts_d`` is loaded with ``DataFrame.from_dict(..., orient='index')``.
    Every key is indexed as ``key[0]`` (blindcode), ``key[1]`` (fact) and
    ``key[2]`` (answer); the value found under column ``0`` of each row is
    kept, together with the answer, only when it is greater than zero.

    Args:
        survey_field: Value stored in the ``dataset`` attribute of every
            ``StudiesMap`` object that is written.
        facts_d: Mapping whose keys are ``(blindcode, fact, answer)``
            sequences.  Presumably the values are percentages; verify
            against the caller.

    Returns:
        None.  Documents are sent through ``bulk(models.client, ...)``.
    """
    # Number of actions sent per bulk request.
    batch_size = 100
    # Facts that are copied onto the StudiesMap attribute of the same name.
    mapped_facts = ('emotion', 'suitable_stage', 'hedonics', 'freshness')

    facts_df = DataFrame.from_dict(facts_d, orient='index')
    facts_df['blindcode'] = [ix[0] for ix in facts_df.index]
    facts_df['fact'] = [ix[1] for ix in facts_df.index]
    facts_df['answer'] = [ix[2] for ix in facts_df.index]

    bulk_data = []
    total_count = 0
    for blindcode, facts_blindcode_df in facts_df.groupby(
            facts_df['blindcode']):
        se = models.StudiesMap()
        se.cft_id = blindcode
        se.dataset = survey_field
        se.ingr_name = blindcode
        se.IPC = blindcode

        # fact -> list of (answer, value) pairs.  A fact gets an entry (possibly
        # empty) as soon as it is seen, even if none of its values are > 0.
        percentile = {}
        for _, fact_s in facts_blindcode_df.iterrows():
            answers = percentile.setdefault(fact_s['fact'], [])
            prc = fact_s[0]
            if prc > 0:
                answers.append((fact_s['answer'], prc))

        for fact, answers in percentile.items():
            if fact in mapped_facts:
                setattr(se, fact, answers)

        bulk_data.append(elastic.convert_for_bulk(se, 'update'))

        # Flush on the real number of pending actions so that the reported
        # total matches what was actually submitted.
        if len(bulk_data) >= batch_size:
            bulk(models.client, actions=bulk_data, stats_only=True)
            total_count += len(bulk_data)
            print(
                "crawl_studies_facts: written another batch, total written {0:d}"
                .format(total_count))
            bulk_data = []

    # Send the remainder; skip the request entirely when nothing is pending.
    if bulk_data:
        bulk(models.client, actions=bulk_data, stats_only=True)
def load_studies_facts(survey_field, facts_d):
    """Bulk-write one ``studies`` update document per blindcode in ``facts_d``.

    ``facts_d`` is loaded with ``DataFrame.from_dict(..., orient='index')``.
    Every key is indexed as ``key[0]`` (blindcode), ``key[1]`` (fact) and
    ``key[2]`` (answer).  For each row the answer is converted to ``str`` and
    kept as ``{'val': answer, 'prc': value}`` when the value under column
    ``0`` is greater than zero and the answer is not ``'Total'``.

    The document ``_id`` and ``cft_id`` are the part of the blindcode before
    the first ``'-'``, so blindcodes sharing that prefix target the same
    document.

    Args:
        survey_field: Value stored under ``survey`` in every document.
        facts_d: Mapping whose keys are ``(blindcode, fact, answer)``
            sequences.  Presumably the values are percentages; verify
            against the caller.

    Returns:
        None.  Documents are sent through ``bulk(models.client, ...)``.
    """
    # Number of actions sent per bulk request.
    batch_size = 100
    index = 'studies'
    doc_type = 'studies'
    # Fact name -> document fields that receive that fact's answer list.
    # NOTE(review): 'suitable_stage' is intentionally absent.  The original
    # code only set it on a StudiesMap object that was never written, so it
    # never reached the document.  Add it here if it should be indexed.
    doc_fields = {
        'emotion': ('emotion',),
        'liking.keyword': ('liking', 'hedonics'),
        'freshness': ('freshness',),
    }

    facts_df = DataFrame.from_dict(facts_d, orient='index')
    facts_df['blindcode'] = [ix[0] for ix in facts_df.index]
    facts_df['fact'] = [ix[1] for ix in facts_df.index]
    facts_df['answer'] = [ix[2] for ix in facts_df.index]

    bulk_data = []
    total_count = 0
    for blindcode, facts_blindcode_df in facts_df.groupby(
            facts_df['blindcode']):
        cft_id = blindcode.split('-')[0]
        doc = {
            '_id': cft_id,
            'cft_id': cft_id,
            'survey': survey_field,
            'blindcode': blindcode,
        }

        # fact -> list of {'val', 'prc'} dicts.  A fact gets an entry (possibly
        # empty) as soon as it is seen, even if all of its rows are filtered.
        percentile = {}
        for _, fact_s in facts_blindcode_df.iterrows():
            answers = percentile.setdefault(fact_s['fact'], [])
            val = str(fact_s['answer'])
            prc = fact_s[0]
            if prc > 0 and val != 'Total':
                answers.append({'val': val, 'prc': prc})

        for fact, answers in percentile.items():
            for field in doc_fields.get(fact, ()):
                doc[field] = answers

        bulk_data.append(elastic.add_to_bulk(index, doc_type, doc, 'update'))

        # Flush on the real number of pending actions so that the reported
        # total matches what was actually submitted.
        if len(bulk_data) >= batch_size:
            bulk(models.client, actions=bulk_data, stats_only=True)
            total_count += len(bulk_data)
            print(
                "load_studies_facts: written another batch, total written {0:d}"
                .format(total_count))
            bulk_data = []

    # Send the remainder; skip the request entirely when nothing is pending.
    if bulk_data:
        bulk(models.client, actions=bulk_data, stats_only=True)