def crawl_studies_facts(survey_field, facts_d):
    """Build one ``StudiesMap`` document per blindcode and bulk-write them.

    ``facts_d`` is loaded with ``DataFrame.from_dict(..., orient='index')``.
    Every key is indexed as ``key[0]`` (blindcode), ``key[1]`` (fact name)
    and ``key[2]`` (answer); the number read for each row is ``row[0]``
    (presumably the percentage stored as the dict value -- confirm against
    the caller).

    For each blindcode a ``models.StudiesMap`` is created with ``cft_id``,
    ``ingr_name`` and ``IPC`` set to the blindcode and ``dataset`` set to
    ``survey_field``.  For every fact, the ``(answer, value)`` pairs whose
    value is greater than zero are collected; the lists for 'emotion',
    'suitable_stage', 'hedonics' and 'freshness' are stored on the attribute
    of the same name.  Other facts are collected but not stored.

    Documents are converted with ``elastic.convert_for_bulk(se, 'update')``
    and sent through ``bulk(models.client, ...)`` in batches.

    Args:
        survey_field: Value stored in each document's ``dataset`` attribute.
        facts_d: Mapping from (blindcode, fact, answer) keys to values.

    Returns:
        None.
    """
    batch_size = 100
    # Facts that have a same-named attribute on the document.
    # NOTE: supplier/olfactive/region/review/dilution/intensity are not
    # populated by this function.
    mapped_facts = ('emotion', 'suitable_stage', 'hedonics', 'freshness')

    facts_df = DataFrame.from_dict(facts_d, orient='index')
    facts_df['blindcode'] = [ix[0] for ix in facts_df.index]
    facts_df['fact'] = [ix[1] for ix in facts_df.index]
    facts_df['answer'] = [ix[2] for ix in facts_df.index]

    bulk_data = []
    total_count = 0

    for blindcode, facts_blindcode_df in facts_df.groupby(
            facts_df['blindcode']):
        se = models.StudiesMap()
        se.cft_id = blindcode
        se.dataset = survey_field
        se.ingr_name = blindcode
        se.IPC = blindcode

        percentile = {}
        for _, fact_s in facts_blindcode_df.iterrows():
            # Register the fact even when none of its rows qualify, so the
            # document attribute is still set (to an empty list).
            entries = percentile.setdefault(fact_s['fact'], [])
            prc = fact_s[0]
            if prc > 0:
                entries.append((fact_s['answer'], prc))

        for fact, entries in percentile.items():
            if fact in mapped_facts:
                setattr(se, fact, entries)

        bulk_data.append(elastic.convert_for_bulk(se, 'update'))

        if len(bulk_data) >= batch_size:
            bulk(models.client, actions=bulk_data, stats_only=True)
            # Count what was actually sent; the previous counter was reset
            # to 1 after each flush and over-reported the running total.
            total_count += len(bulk_data)
            print(
                "crawl_studies_facts: written another batch, total written {0:d}"
                .format(total_count))
            bulk_data = []

    # Flush the remainder, if any.
    if bulk_data:
        bulk(models.client, actions=bulk_data, stats_only=True)
Example #2
0
def load_studies_facts(survey_field, facts_d):
    """Build one 'studies' document per blindcode and bulk-write them.

    ``facts_d`` is loaded with ``DataFrame.from_dict(..., orient='index')``.
    Every key is indexed as ``key[0]`` (blindcode), ``key[1]`` (fact name)
    and ``key[2]`` (answer); the number read for each row is ``row[0]``
    (presumably the percentage stored as the dict value -- confirm against
    the caller).

    Each document is a plain dict with:
      * ``_id`` and ``cft_id``: the blindcode up to its first '-',
      * ``survey``: ``survey_field``,
      * ``blindcode``: the full blindcode,
      * ``emotion`` / ``freshness``: entries of the same-named fact,
      * ``liking`` and ``hedonics``: entries of the 'liking.keyword' fact.

    An entry is ``{'val': str(answer), 'prc': value}`` and is kept only when
    the value is greater than zero and the answer is not 'Total'.

    Documents are wrapped with ``elastic.add_to_bulk(index, doc_type, doc,
    'update')`` and sent through ``bulk(models.client, ...)`` in batches.

    Args:
        survey_field: Value stored in each document's ``survey`` field.
        facts_d: Mapping from (blindcode, fact, answer) keys to values.

    Returns:
        None.
    """
    index = 'studies'
    doc_type = 'studies'
    batch_size = 100
    # Fact name -> document fields that receive its entries.
    # NOTE(review): 'suitable_stage' was previously only set on a
    # StudiesMap object that was never written, so it has never reached the
    # indexed document; that is preserved here -- confirm whether it should
    # be indexed.
    doc_fields = {
        'emotion': ('emotion',),
        'liking.keyword': ('liking', 'hedonics'),
        'freshness': ('freshness',),
    }

    facts_df = DataFrame.from_dict(facts_d, orient='index')
    facts_df['blindcode'] = [ix[0] for ix in facts_df.index]
    facts_df['fact'] = [ix[1] for ix in facts_df.index]
    facts_df['answer'] = [ix[2] for ix in facts_df.index]

    bulk_data = []
    total_count = 0

    for blindcode, facts_blindcode_df in facts_df.groupby(
            facts_df['blindcode']):
        cft_id = blindcode.split('-')[0]
        doc = {
            '_id': cft_id,
            'cft_id': cft_id,
            'survey': survey_field,
            'blindcode': blindcode,
        }

        percentile = {}
        for _, fact_s in facts_blindcode_df.iterrows():
            entries = percentile.setdefault(fact_s['fact'], [])
            val = str(fact_s['answer'])
            prc = fact_s[0]
            # 'Total' rows are aggregates, not individual answers.
            if prc > 0 and val != 'Total':
                entries.append({'val': val, 'prc': prc})

        for fact, entries in percentile.items():
            for field in doc_fields.get(fact, ()):
                doc[field] = entries

        bulk_data.append(elastic.add_to_bulk(index, doc_type, doc, 'update'))

        if len(bulk_data) >= batch_size:
            bulk(models.client, actions=bulk_data, stats_only=True)
            # Count what was actually sent; the previous counter was reset
            # to 1 after each flush and over-reported the running total.
            total_count += len(bulk_data)
            print(
                "load_studies_facts: written another batch, total written {0:d}"
                .format(total_count))
            bulk_data = []

    # Flush the remainder, if any.
    if bulk_data:
        bulk(models.client, actions=bulk_data, stats_only=True)