def clone_index(createidx=False, test=True, source_idx='myvariant_all_1',
                new_idx='myvariant_current_3', step=10000):
    """Re-index all ``variant`` docs from *source_idx* into *new_idx*.

    Documents are streamed out of *source_idx* with ``ESIndexer.doc_feeder``
    and bulk-indexed into *new_idx* in batches of *step* documents.

    Parameters
    ----------
    createidx : bool
        If True, apply the project mapping to *new_idx* before indexing.
        NOTE(review): the actual ``es.indices.create`` call was commented
        out in the original code, so the target index must already exist
        for ``put_mapping`` to succeed — confirm whether creation should
        be re-enabled.
    test : bool
        Safety guard: when True (the default) the function returns
        immediately without touching Elasticsearch.
    source_idx : str
        Index to read documents from.
    new_idx : str
        Index to write documents into.
    step : int
        Batch size for both the feeder and the bulk indexer.
    """
    if test:
        # dry-run guard — must be explicitly disabled to do real work
        return
    from utils.es import ESIndexer
    from utils.common import iter_n
    if createidx:
        from mapping import get_mapping
        m = get_mapping()
        body = {'settings': {'number_of_shards': 10}}
        # ### es.indices.create(new_idx, body=body)
        es.indices.put_mapping(index=new_idx, doc_type='variant', body=m)
    esi = ESIndexer()
    doc_iter = esi.doc_feeder(index=source_idx, doc_type='variant', step=step)
    for doc_batch in iter_n(doc_iter, step):
        do_index(doc_batch, index_name=new_idx, doc_type='variant',
                 step=step, verbose=False, update=True)
import os
from collections import defaultdict
from mapping import get_mapping
from pymongo import MongoClient
import ast
import math
import numpy as np
import json
import csv

DATADIR = './data/'
DATAFILE = 'earn_nt_net.tsv'
DATAFILE_OUT = 'tax_ratio.csv'

MAPPING = get_mapping()


def parse_file(datafile):
    """Parse a mixed comma/tab delimited data file into a list of dicts.

    The file layout: the first three fields on each line are
    comma-separated; the fourth comma-field is itself tab-separated and is
    expanded in place.  The first line is the header; its fourth column is
    renamed to ``'country'``.  Each data row becomes a dict mapping the
    stripped header names to the stripped field values.

    Parameters
    ----------
    datafile : str
        Path of the file to parse.

    Returns
    -------
    list[dict]
        One dict per data row, keyed by the (stripped) header columns.
    """
    data = []
    # Fix: 'rU' mode was deprecated and removed in Python 3.11 (raises
    # ValueError there); plain 'r' already gives universal newlines in Py3.
    with open(datafile, 'r') as f:
        header = f.readline().split(",")
        # expand the tab-separated tail of the 4th comma-field in place
        header[3:] = header[3].split("\t")
        header[3] = 'country'  # use a friendly name
        for line in f:
            fields = line.split(",")
            fields[3:] = fields[3].split("\t")
            entry = {}
            for i, value in enumerate(fields):
                entry[header[i].strip()] = value.strip()
            data.append(entry)
    return data
def create_index(index_name, mapping=None):
    """Create Elasticsearch index *index_name* with 20 shards.

    Parameters
    ----------
    index_name : str
        Name of the index to create.
    mapping : dict, optional
        Mapping to install; when omitted (or falsy) the project default
        from ``get_mapping()`` is used.
    """
    if not mapping:
        # fall back to the project-wide default mapping
        mapping = get_mapping()
    body = {
        'settings': {'number_of_shards': 20},
        'mappings': mapping,
    }
    es.indices.create(index=index_name, body=body)