def set_in_index(self, documentList):
        """
        Store the list of documents in the Elasticsearch index via HTTP APIs

        @type  documentList: List
        @param documentList: List of image layer JSON documents
        """
        #Get the Elasticsearch address from the config file
        cfg = config.load()

        #Store the document list in Elasticsearch
        es = ElasticSearch(cfg.search_options.get("address"))
        try:
            es.bulk_index(cfg.search_options.get("index"),
                          cfg.search_options.get("type"),
                          documentList,
                          id_field='id')
        except InvalidJsonResponseError:
            logger.debug("InvalidJsonResponseError!")
        except Timeout:
            logger.debug("Timeout!")
        except ConnectionError:
            logger.debug("ConnectionError!")
        except ElasticHttpNotFoundError:
            logger.debug("ElasticHttpNotFoundError!")
        except ElasticHttpError:
            logger.debug("ElasticHttpError!")
Example #2
class ESLayers(object):
    """Implementation of Elastic Search as layers backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def _transform(self, layer, version, layer_name):
        """Add some meta data fields which are ES specific"""
        layer = dict(layer)     # copy
        label = layer['label']
        del layer['label']
        return {
            'id': '%s/%s/%s' % (version, layer_name, label),
            'version': version,
            'name': layer_name,
            'label': label,
            'layer': layer
        }

    def bulk_put(self, layers, version, layer_name, root_label):
        """Store all layer objects"""
        self.es.bulk_index(
            settings.ELASTIC_SEARCH_INDEX, 'layer',
            map(lambda l: self._transform(l, version, layer_name),
                layers))

    def get(self, name, label, version):
        """Find the layer that matches these parameters"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'layer',
                                 version + '/' + name + '/' + label)

            return result['_source']['layer']
        except ElasticHttpNotFoundError:
            return None
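
Note that bulk_put hands the result of map() straight to bulk_index; this code targets Python 2, where map() returns a list. Under Python 3, wrapping the map() call in list() is the conservative port. A short usage sketch, assuming settings.ELASTIC_SEARCH_URLS and settings.ELASTIC_SEARCH_INDEX are configured and using made-up layer data:

backend = ESLayers()
layers = [
    {'label': '1234-1', 'text': 'First paragraph'},
    {'label': '1234-2', 'text': 'Second paragraph'},
]
backend.bulk_put(layers, 'v1', 'toc', '1234')
# get() is a lookup by document id, so it sees the new document straight away
print(backend.get('toc', '1234-1', 'v1'))   # {'text': 'First paragraph'}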
Example #3
class ESLayers(object):
    """Implementation of Elastic Search as layers backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def _transform(self, layer, version, layer_name):
        """Add some meta data fields which are ES specific"""
        layer = dict(layer)  # copy
        label = layer['label']
        del layer['label']
        return {
            'id': '%s/%s/%s' % (version, layer_name, label),
            'version': version,
            'name': layer_name,
            'label': label,
            'layer': layer
        }

    def bulk_put(self, layers, version, layer_name, root_label):
        """Store all layer objects"""
        self.es.bulk_index(
            settings.ELASTIC_SEARCH_INDEX, 'layer',
            map(lambda l: self._transform(l, version, layer_name), layers))

    def get(self, name, label, version):
        """Find the layer that matches these parameters"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'layer',
                                 version + '/' + name + '/' + label)

            return result['_source']['layer']
        except ElasticHttpNotFoundError:
            return None
    def set_in_index(self, documentList):
        """
        Store the list of documents in the Elasticsearch index via HTTP APIs

        @type  documentList: List
        @param documentList: List of image layer JSON documents
        """
        #Get the Elasticsearch address from the config file
        cfg = config.load()

        #Store the document list in Elasticsearch
        es = ElasticSearch(cfg.search_options.get("address"))
        try:
            es.bulk_index(cfg.search_options.get("index"), cfg.search_options.get("type"), documentList, id_field='id')
        except InvalidJsonResponseError:
            logger.debug("InvalidJsonResponseError!")
        except Timeout:
            logger.debug("Timeout!")
        except ConnectionError:
            logger.debug("ConnectionError!")
        except ElasticHttpNotFoundError:
            logger.debug("ElasticHttpNotFoundError!")
        except ElasticHttpError:
            logger.debug("ElasticHttpError!")
Example #5
def import_json_into_es(types, inputfolder, logger):
    """
    imports entities from the *name.json.bz2* files (one entity per line) into local elasticsearch
    :param types: json string like {'person': 'http://www.wikidata.org/entity/Q5', 'name': 'Wikidata-URI'}
    :param inputfolder:
    :param logger:
    :return:
    """

    es = ElasticSearch(config.ELASTICSEARCH_URL)

    try:
        es.delete_index('wikidata')
        es.create_index('wikidata')
        logger.info('rebuild index [wikidata]')
    except:
        logger.warning('cant delete wikidata index')


    # convert type dictionary
    wd_types = dict()
    for key in types.keys():
        value = int(types[key].split('/')[-1][1:])
        wd_types[value] = {'type': key,
                           'filename': path.join(inputfolder, '{}.json.bz2'.format(key))}


    # import each given type
    for key in wd_types:
        logger.info(wd_types[key])

        done = 0
        items = []

        for line in BZ2File(wd_types[key]['filename'],'rb'):
            line = line.strip()
            item = loads(line)
            item['uri'] = 'http://wikidata.org/wiki/' + item['id']

            items.append(item)
            done += 1

            if ( done % 5000 == 0 ):
                es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')
                items = []

            # if done % len(wd_types) / 10 == 0: # log 10% steps
            #     logger.info('imported {}: {:,d} ({:,d})'.format(wd_types[key]['type'],done, 100*len(wd_types)/done ))

            if done % 10000 == 0:
                logger.info('imported {}: {}'.format(wd_types[key]['type'],format(done, ',d')))

        if len(items) > 0:
            es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')
        logger.info('imported {}: {}'.format(wd_types[key]['type'],format(done, ',d')))
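
The flush-every-5000-documents pattern above appears in several of these examples; a small helper keeps that batching logic in one place (the helper name and default batch size are illustrative):

def bulk_index_in_batches(es, index, doc_type, docs, batch_size=5000, id_field='id'):
    """Feed docs to es.bulk_index() in fixed-size batches."""
    batch = []
    for doc in docs:
        batch.append(doc)
        if len(batch) >= batch_size:
            es.bulk_index(index, doc_type, batch, id_field=id_field)
            batch = []
    if batch:  # index whatever is left over
        es.bulk_index(index, doc_type, batch, id_field=id_field)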
Example #6
def feed(index='monolith', type='downloads', es_port=9200):
    client = ElasticSearch('http://0.0.0.0:%d/' % es_port)
    platforms = ['Mac OS X', 'Windows 8', 'Ubuntu']

    # indexing a year of data (2012)
    first_day = datetime.datetime(2012, 1, 1)
    last_day = datetime.datetime(2012, 12, 31)
    day_range = last_day - first_day

    for month in range(1, 13):
        name = 'time_2012-%.2d' % month
        try:
            client.delete_index(name)
        except Exception:
            pass
        client.create_index(name,
                            settings={
                                'number_of_shards': 1,
                                'number_of_replicas': 0,
                                'analysis': {
                                    'analyzer': {
                                        'default': {
                                            'type': 'custom',
                                            'tokenizer': 'keyword'
                                        }
                                    }
                                },
                                'store': {
                                    'compress': {
                                        'stored': 'true'
                                    }
                                },
                            })

    # indexing 100 apps
    for add_on in range(100):
        docs = defaultdict(list)
        for delta in range(day_range.days):
            date = first_day + datetime.timedelta(days=delta)
            data = {
                'date': date,
                'os': random.choice(platforms),
                'downloads_count': random.randint(1000, 1500),
                'users_count': random.randint(10000, 15000),
                'add_on': add_on + 1
            }
            docs[date.month].append(data)
        for month, values in docs.items():
            client.bulk_index('time_2012-%.2d' % month, type, values)
            sys.stdout.write('.')
            sys.stdout.flush()

    client.optimize('time_*', max_num_segments=1, wait_for_merge=True)
    client.flush()
    sys.stdout.write('\nDone!\n')
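
feed() spreads a year of documents over one index per month (time_2012-01 through time_2012-12). Reads can address all of them at once with the same time_* wildcard used by the optimize call, so a query does not need to know which month a document landed in; a hedged sketch (the match query and result handling are illustrative):

client = ElasticSearch('http://0.0.0.0:9200/')
client.refresh('time_*')  # make freshly indexed documents searchable
result = client.search({'query': {'match': {'add_on': 1}}},
                       index='time_*', doc_type='downloads', size=10)
print(result['hits']['total'])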
Example #7
def index_data(data_source, index_name, doc_type):
    es = ElasticSearch(urls='http://localhost', port=9200)
    try:
        es.delete_index(index_name)
    except:
        pass
    es.create_index(index_name)
    try:
        es.bulk_index(index_name, doc_type, data_source)
    except:
        print("Error! Skipping Document...!")
        pass
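
The bare except clauses here hide why a bulk request failed. A variant that catches the specific pyelasticsearch exceptions used elsewhere on this page makes the failure visible while still skipping the bad batch (assuming, as the other snippets' bare names suggest, that these classes are importable from the package root):

from pyelasticsearch import (ElasticSearch, ElasticHttpError,
                             ElasticHttpNotFoundError, ConnectionError, Timeout)

def index_data_verbose(data_source, index_name, doc_type):
    es = ElasticSearch(urls='http://localhost', port=9200)
    try:
        es.delete_index(index_name)
    except ElasticHttpNotFoundError:
        pass  # the index simply did not exist yet
    es.create_index(index_name)
    try:
        es.bulk_index(index_name, doc_type, data_source)
    except (ElasticHttpError, ConnectionError, Timeout) as exc:
        print("Error! Skipping documents: %s" % exc)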
Example #8
class ESRegulations(object):
    """Implementation of Elastic Search as regulations backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def get(self, label, version):
        """Find the regulation label + version"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                                 version + '/' + label)

            reg_node = result['_source']
            del reg_node['regulation']
            del reg_node['version']
            del reg_node['label_string']
            del reg_node['id']
            return reg_node
        except ElasticHttpNotFoundError:
            return None

    def _transform(self, reg, version):
        """Add some meta data fields which are ES specific"""
        node = dict(reg)  # copy
        node['version'] = version
        node['label_string'] = '-'.join(node['label'])
        node['regulation'] = node['label'][0]
        node['id'] = version + '/' + node['label_string']
        node['root'] = len(node['label']) == 1
        return node

    def bulk_put(self, regs, version, root_label):
        """Store all reg objects"""
        self.es.bulk_index(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                           map(lambda r: self._transform(r, version), regs))

    def listing(self, label=None):
        """List regulation version-label pairs that match this label (or are
        root, if label is None)"""
        if label is None:
            query = {'match': {'root': True}}
        else:
            query = {'match': {'label_string': label}}
        query = {'fields': ['label_string', 'version'], 'query': query}
        result = self.es.search(query,
                                index=settings.ELASTIC_SEARCH_INDEX,
                                doc_type='reg_tree',
                                size=100)
        return sorted((res['fields']['version'], res['fields']['label_string'])
                      for res in result['hits']['hits'])
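
listing() runs a filtered search over the label_string and version fields and returns sorted (version, label) pairs. A short usage sketch with made-up regulation data (a freshly bulk-indexed document only becomes searchable after the index refreshes):

backend = ESRegulations()
backend.bulk_put([{'label': ['1234'], 'text': 'root node'}], 'v1', '1234')
# once the index has refreshed, both listings find the new root node
print(backend.listing())        # e.g. [('v1', '1234')]
print(backend.listing('1234'))  # e.g. [('v1', '1234')]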
Example #9
class ESRegulations(object):
    """Implementation of Elastic Search as regulations backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def get(self, label, version):
        """Find the regulation label + version"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                                 version + '/' + label)

            reg_node = result['_source']
            del reg_node['regulation']
            del reg_node['version']
            del reg_node['label_string']
            del reg_node['id']
            return reg_node
        except ElasticHttpNotFoundError:
            return None

    def _transform(self, reg, version):
        """Add some meta data fields which are ES specific"""
        node = dict(reg)    # copy
        node['version'] = version
        node['label_string'] = '-'.join(node['label'])
        node['regulation'] = node['label'][0]
        node['id'] = version + '/' + node['label_string']
        node['root'] = len(node['label']) == 1
        return node

    def bulk_put(self, regs, version, root_label):
        """Store all reg objects"""
        self.es.bulk_index(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                           map(lambda r: self._transform(r, version), regs))

    def listing(self, label=None):
        """List regulation version-label pairs that match this label (or are
        root, if label is None)"""
        if label is None:
            query = {'match': {'root': True}}
        else:
            query = {'match': {'label_string': label}}
        query = {'fields': ['label_string', 'version'], 'query': query}
        result = self.es.search(query, index=settings.ELASTIC_SEARCH_INDEX,
                                doc_type='reg_tree', size=100)
        return sorted((res['fields']['version'], res['fields']['label_string'])
                      for res in result['hits']['hits'])
Example #10
def index_data(data_path, chunksize, index_name, doc_type):
    f = open(data_path)
    csvfile = pd.read_csv(f, iterator=True, chunksize=chunksize)
    es = ElasticSearch(urls='http://localhost', port=9200)
    try:
        es.delete_index(index_name)
    except:
        pass
    es.create_index(index_name)
    for i, df in enumerate(csvfile):
        records = df.where(pd.notnull(df), None).T.to_dict()
        records_list = [records[i] for i in records]
        try:
            es.bulk_index(index_name, doc_type, records_list)
        except:
            print("Error! Skipping chunk...!")
            pass
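
Because the CSV is read with iterator=True and a fixed chunksize, only one chunk of rows is held in memory per bulk request. A usage sketch with a hypothetical file and sizes:

index_data('people.csv', chunksize=5000, index_name='people', doc_type='person')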
Example #11
0
def feed(index='monolith', type='downloads', es_port=9200):
    client = ElasticSearch('http://0.0.0.0:%d/' % es_port)
    platforms = ['Mac OS X', 'Windows 8', 'Ubuntu']

    # indexing a year of data (2012)
    first_day = datetime.datetime(2012, 1, 1)
    last_day = datetime.datetime(2012, 12, 31)
    day_range = last_day - first_day

    for month in range(1, 13):
        name = 'time_2012-%.2d' % month
        try:
            client.delete_index(name)
        except Exception:
            pass
        client.create_index(name, settings={
            'number_of_shards': 1,
            'number_of_replicas': 0,
            'analysis': {'analyzer': {'default': {
                'type': 'custom', 'tokenizer': 'keyword'
            }}},
            'store': {'compress': {'stored': 'true'}},
        })

    # indexing 100 apps
    for add_on in range(100):
        docs = defaultdict(list)
        for delta in range(day_range.days):
            date = first_day + datetime.timedelta(days=delta)
            data = {'date': date,
                    'os': random.choice(platforms),
                    'downloads_count': random.randint(1000, 1500),
                    'users_count': random.randint(10000, 15000),
                    'add_on': add_on + 1}
            docs[date.month].append(data)
        for month, values in docs.items():
            client.bulk_index('time_2012-%.2d' % month, type, values)
            sys.stdout.write('.')
            sys.stdout.flush()

    client.optimize('time_*', max_num_segments=1, wait_for_merge=True)
    client.flush()
    sys.stdout.write('\nDone!\n')
	def indexNodes(self):

		es = ElasticSearch('http://0.0.0.0:9200')

		i = 0
		for file in os.listdir(self.dataIndexNodes):

			if file.endswith('.json'):

				with open(self.dataIndexNodes + file, "r") as f:

					nodes = json.loads(f.read())
					print ("Indexing Node data", self.dataIndexNodes + file, len(nodes))

					bulkCount = 0
					bulkAry = []

					for node in nodes:
						i += 1

						if (i < 170000):
							continue

						bulkCount = bulkCount + 1
						bulkAry.append(node);

						if bulkCount == 1000:

							es.bulk_index('nodes','node',bulkAry, id_field='id')
							bulkCount = 0
							bulkAry = []


						print i

					if len(bulkAry) != 0:
						es.bulk_index('nodes','node',bulkAry, id_field='id')
Example #13
    def send(self, messages):
        if self.type == '@type':
            self.type = messages[0].get('@type')
            logger.debug('Type is \'@type\' - setting it to %r', self.type)

        es = ElasticSearch('http://%s:%s' % (self.host, self.port))

        now = datetime.utcnow()
        index = now.strftime('logstash-%Y.%m.%d')

        result = es.bulk_index(index=index, doc_type=self.type, docs=messages)
        logger.debug('Elasticsearch bulk_index run returned with:\n\n%s\n',
                     pformat(result))
        return True
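
send() derives a daily logstash-YYYY.MM.DD index name so that Kibana picks the documents up by date. The same indexing step written out standalone (the host, doc type, and sample message are illustrative; only the keyword-style bulk_index call mirrors the code above):

from datetime import datetime

es = ElasticSearch('http://localhost:9200/')
index = datetime.utcnow().strftime('logstash-%Y.%m.%d')
messages = [{'@type': 'syslog', '@message': 'service started',
             '@timestamp': '2013-06-13T12:00:00Z'}]
result = es.bulk_index(index=index, doc_type='syslog', docs=messages)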
Example #14
def load(args):
    """
    Load jobs from external data sources.
    """
    es = ElasticSearch(args.elastic_search_url)
    default_providers = {
        'github': providers.Github(),
        'indeed': providers.Indeed(args.indeed_api_key),
        'craigslist': providers.Craigslist()
    }
    chosen_providers = get_providers(
        args.providers, default_providers) or set(default_providers.values())
    excluded_providers = get_providers(
        args.exclude_providers, default_providers) or set()

    for provider in chosen_providers - excluded_providers:
        name = provider.name
        params = {
            'location': args.location,
            'query': args.query
        }
        data = provider.get(**params)
        tagline = '{name} data for location {location} and ' \
                  'query {query}'.format(name=name, **params)

        try:
            result = es.bulk_index(provider.name.lower(), 'job', data)
        except ValueError:
            print('Skipping {tagline}. 0 items found.'.format(tagline=tagline))
            continue

        num_items = len(result['items'])

        print('Loaded {tagline}. Result: {num_items} jobs in {time} '
              'seconds'.format(tagline=tagline, num_items=num_items,
                               time=result['took']))
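
The except ValueError branch above relies on bulk_index rejecting an empty document list. Checking the list up front reads a little more directly; a sketch of that alternative for the body of the provider loop (behaviour otherwise unchanged):

data = provider.get(**params)
tagline = '{name} data for location {location} and ' \
          'query {query}'.format(name=name, **params)
if not data:
    print('Skipping {tagline}. 0 items found.'.format(tagline=tagline))
    continue
result = es.bulk_index(provider.name.lower(), 'job', data)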
Example #15
def import_json_into_es(types, inputfolder, logger):
    """
    imports entities from the *name.json.bz2* files (one entity per line) into local elasticsearch
    :param types: json string like {'person': 'http://www.wikidata.org/entity/Q5', 'name': 'Wikidata-URI'}
    :param inputfolder:
    :param logger:
    :return:
    """

    es = ElasticSearch(config.ELASTICSEARCH_URL)

    try:
        es.delete_index('wikidata')
        es.create_index('wikidata')
        logger.info('rebuild index [wikidata]')
    except:
        logger.warning('cant delete wikidata index')

    # convert type dictionary
    wd_types = dict()
    for key in types.keys():
        value = int(types[key].split('/')[-1][1:])
        wd_types[value] = {
            'type': key,
            'filename': path.join(inputfolder, '{}.json.bz2'.format(key))
        }

    # import each given type
    for key in wd_types:
        logger.info(wd_types[key])

        done = 0
        items = []

        for line in BZ2File(wd_types[key]['filename'], 'rb'):
            line = line.strip()
            item = loads(line)
            item['uri'] = 'http://wikidata.org/wiki/' + item['id']

            items.append(item)
            done += 1

            if (done % 5000 == 0):
                es.bulk_index('wikidata',
                              wd_types[key]['type'],
                              items,
                              id_field='id')
                items = []

            # if done % len(wd_types) / 10 == 0: # log 10% steps
            #     logger.info('imported {}: {:,d} ({:,d})'.format(wd_types[key]['type'],done, 100*len(wd_types)/done ))

            if done % 10000 == 0:
                logger.info('imported {}: {}'.format(wd_types[key]['type'],
                                                     format(done, ',d')))

        if len(items) > 0:
            es.bulk_index('wikidata',
                          wd_types[key]['type'],
                          items,
                          id_field='id')
        logger.info('imported {}: {}'.format(wd_types[key]['type'],
                                             format(done, ',d')))
Example #16
class TestClient(unittest.TestCase):

    def setUp(self):
        super(TestClient, self).setUp()
        docs = []

        self.es_host = os.environ.get('ES_HOST', 'http://*****:*****

    @mock.patch('monolith.client.util.iterweeks')
    def test_datetime_ranges(self, _mock):
        "Test datetime ranges get converted to dates."
        client = self._make_one()
        start = datetime.datetime(2012, 1, 1, 12, 34, 56)
        end = datetime.datetime(2012, 1, 31, 12, 34, 56)
        list(client('downloads_count', start, end, interval='week'))
        self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1))
        assert not isinstance(_mock.call_args[0][0], datetime.datetime)
        self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31))
        assert not isinstance(_mock.call_args[0][1], datetime.datetime)

    def test_date_order(self):
        # Ensure fill doesn't change date ordering.
        client = self._make_one()
        prev_date = datetime.date(2000, 1, 1)

        # Addon 1 doesn't have downloads for every month and the client will
        # fill zeroes for the missing dates.
        hits = list(client('downloads_count', START, '2012-05-01',
                           interval='month', add_on='1'))
        for hit in hits:
            d = hit['date']
            assert prev_date < d
            prev_date = d
Example #17
client = app.test_client()
ctx = app.test_request_context()
ctx.push()

es = ElasticSearch('http://localhost:9200/')
try:
    es.delete_index('cdpp')
except ElasticHttpNotFoundError:
    # we can safely ignore this, because it might be an initial run
    pass
res = db.session.query(Sign).all()
for r in res:
    d = r.__dict__
    d.pop('_sa_instance_state', None)
# bulk-index the cleaned signs
es.bulk_index('cdpp', 'sign', [r.__dict__ for r in res], id_field='id')

tablets = db.session.query(Tablet).all()
repr = []
for result in tablets:
    d = result.__dict__
    keys = ['medium', 'city', 'locality', 'period', 'sub_period', 'text_vehicle', 'method', 'genre', 'museum_number']
    as_dict = {}
    for key in keys:
        value = getattr(result, key)
        if value:
            as_dict[key] = unicode(value)
    if result.rulers:
        as_dict['ruler'] = result.rulers[0].name
Example #18
                            "type": "string",
                            "index": "analyzed"
                            },
                        "untouched": {
                            "type": "string",
                            "index": "not_analyzed"
                            }
                        }
                   },
                "topics": {
                    "type": "multi_field",
                    "fields": {
                        "topics": {
                            "type": "string",
                            "index": "analyzed"
                            },
                        "untouched": {
                            "type": "string",
                            "index": "not_analyzed"
                            }
                        }
                   }
                }
            }
        }

    es.create_index(index, {"mappings": mapping})
    # es.put_mapping(index, doc_type, mapping)

    es.bulk_index(index, doc_type, get_docs(fname), 'persistent_id')
Example #19
def ES_bulk_insert(file_name):
    index_name = "geodata"
    doc_type = "data"

    # ElasticSearch URL
    ElasticSearch_URL = "http://localhost:9200/"

    file_path_name = raw_data_path + "/" + file_name

    t0 = time()

    #Bulk size to be import the records in elasticsearch
    chunk_size = 5000

    txt_file = pd.read_csv(
        file_path_name,
        sep="\t",
        iterator=True,
        chunksize=chunk_size,
        header=None,
        names=[
            'geonameid', 'name', 'asciiname', 'alternatenames', 'latitude',
            'longitude', 'feature_class', 'feature_code', 'country_code',
            'cc2', 'admin1_code', 'admin2_code', 'admin3_code', 'admin4_code',
            'population', 'elevation', 'dem', 'timezone', 'modification_date'
        ],
        dtype={
            "geonameid": int64,
            "name": object,
            "asciiname": object,
            "alternatenames": object,
            "latitude": float64,
            "longitude": float64,
            "feature_class": object,
            "feature_code": object,
            "country_code": object,
            "cc2": object,
            "admin1_code": object,
            "admin2_code": object,
            "admin3_code": object,
            "admin4_code": object,
            "population": int64,
            "elevation": object,
            "dem": int64,
            "timezone": object,
            "modification_date": object
        })

    # Connecting to ElasticSearch
    es = ElasticSearch(ElasticSearch_URL)
    print("Data Import started for file ", file_name)

    #Insert the Dataframe to elasticsearch using bulk
    for i, df in enumerate(txt_file):
        print(i)
        records = df.where(pd.notnull(df), None).T.to_dict()
        list_records = [records[it] for it in records]
        try:
            es.bulk_index(index_name, doc_type, list_records)
        except:
            print("Error.. Skipping some records")
            pass

    print("File ", file_name, "imported in %.3fs" % (time() - t0))
Example #20
	except Exception, e:
		print e
	else :	
		print "Created flights"


	s.put_mapping("flights","flight",simplejson.loads('{"flight":{"properties":{"datum":{"type":"string","index":"not_analyzed","omit_norms":true,"index_options":"docs"},"type": { "type": "string", "index" : "not_analyzed" }, "duration":{"type":"double"},"end":{"properties":{"alt":{"type":"integer"},"dist":{"type":"float"},"speed":{"type":"integer"},"time":{"type":"date","format":"dateOptionalTime"},"town":{"type":"string","analyzer":"keyword"},"country":{"type":"string","analyzer":"keyword"}}},"flight":{"type":"string","store":true,"analyzer":"keyword"},"hex":{"type":"string","store":true,"analyzer":"keyword"},"id":{"type":"string","store":true},"radar":{"type":"string","store":true,"analyzer":"keyword"},"reg":{"type":"string","store":true,"analyzer":"keyword"},"route":{"properties":{"coordinates":{"type":"double"},"type":{"type":"string"}}},"start":{"properties":{"alt":{"type":"integer"},"dist":{"type":"float"},"speed":{"type":"integer"},"time":{"type":"date","format":"dateOptionalTime"},"town":{"type":"string","analyzer":"keyword"},"country":{"type":"string","analyzer":"keyword"}}}}}}'))



def md(a) :
    a["datum"]=a["starttime"][:10]
    return a
    
    
def makets(a) :
    for f in ("starttime","endtime") :
        a[f]=maket(a[f])
    return a



d=simplejson.load(sys.stdin)
chunksize=50
print "%s documents" % (len(d),)
for i in xrange(0,len(d),chunksize) :
	s.bulk_index("flights","flight",d[i:i+chunksize])	
	print "inserted %s starting from %s" % (chunksize,i)


def build_es_index(raw_data_path):
    if "TOPOGRAM_TMP_PATH" in os.environ:
        tmp_path=os.environ.get('TOPOGRAM_TMP_PATH')
    else: tmp_path='/tmp'

    # raw_data_path=os.path.join(raw_path,"data/datazip/selected/")
    pid_file=os.path.join(tmp_path,"csv_chunk")

    # config elasticsearch
    if "TOPOGRAM_ES_HOST" in os.environ:
        es_host=os.environ.get('TOPOGRAM_ES_HOST')
    else: es_host='http://localhost:9200/'

    # init ElasticSearch
    es = ElasticSearch(es_host)

    # size of CSV chunk to process
    chunksize=1000

    # parse index name : 2 weeks per index to fasten search
    weeks={}
    for r in xrange(1,52,2):
        weeks[r]=weeks[r+1]="weiboscope_"+str(r)+"_"+str(r+1)

    # for w in  weeks: print w,weeks[w]

    # init
    previous_chunk=0
    t0=time()

    for path, subdirs, files in os.walk(raw_data_path):
        
        # loop through each files
        i_file=0
        for filename in files: 
            # if i==1 : break

            file_is_ok=False

            # check if there is an ongoing task
            if filename[-10:] == "processing":
                
                file_is_ok=True

                # get previous
                with open(pid_file, "r") as pid:
                    previous_chunk=int(pid.read())

                # get previous
                file_to_process_name=filename

            elif filename[-3:] == "zip" and filename[:4] == "week": # get only zip files

                file_is_ok=True
            
            if file_is_ok==True :
                t1=time()
                i_file+=1
            
                # flag the file
                zip_path=os.path.join(path,filename)
                # print zip_path
                if filename[-10:] != "processing": 
                    os.rename(zip_path, zip_path+".processing")
                    zip_path=os.path.join(path,filename+".processing")

                raw_csvname=filename.split(".")[0]+".csv" 
                
                # read zipped csv files
                with zipfile.ZipFile(zip_path) as z: # open zip

                    f = z.open(raw_csvname) # read csv
                    csvfile=pd.read_csv(f, iterator=True, chunksize=chunksize) 
                    
                    week_number=filename.split(".")[0][4:]
                    index_name=weeks[int(week_number)]
                    # print index_name

                    for i,df in enumerate(csvfile):

                        if i <= previous_chunk:
                            print i, "%d files, already indexed %s"%(i_file,raw_csvname)
                        else:
                            print i, "%d files, now indexing %s"%(i_file,raw_csvname)

                            # fix the date formatting
                            df["created_at"]=df["created_at"].str.replace(" ", "T")
                            
                            try :
                                
                                # fix encoding
                                df["text"]=df["text"].str.decode("utf-8")

                                # convert dataframe to json object
                                records=df.where(pd.notnull(df), None).T.to_dict()

                                # convert json object to a list of json objects
                                list_records=[records[it] for it in records]

                                # insert into elasticsearch
                                try :
                                    es.bulk_index(index_name,"tweet",list_records)
                                except :
                                    print "error with elasticsearch"
                                    pass
                                    
                            except :
                                print "encoding problem..."
                                pass

                            with open(pid_file, "w") as pid:
                                pid.write(str(i))

                print "%s processed in %.3fs"%(raw_csvname,time()-t1)

                # flag the file : done
                # os.rename(zip_path, zip_path+".done")
                os.remove(zip_path)
                
                # reset counters
                previous_chunk=0
                with open(pid_file, "w") as pid:
                    pid.write(str(0))

    print "Everything done in %.3fs"%(time()-t0)
DELIMITER = str(input5)
ERRORFILEOUTPUT = open(input6, "w")

print "Running..."

for line in FILENAME:
    fields = line.split(DELIMITER)
    data.append({
        "Filename": fields[0].strip(),
        "File Type": fields[1].strip(),
        "Language1": fields[2].strip(),
        "Language2": fields[3].strip(),
        "Language3": fields[4].strip()
    })
    try:
        conn.bulk_index(PROJECTNAME, INDEXNAME, data)
        correct_counter += 1
    except Exception as e:
        for i in data:
            failurelist.append(data)
        failed_count += 1
    data = []

if correct_counter != 0:
    print "\n", correct_counter, " rows were successfully loaded into ES \n"
elif correct_counter == 0:
    print "\n", correct_counter, " rows were loaded into elasticsearch.\n"

if (failed_count != 0) & (correct_counter != 0):
    print failed_count, "rows failed to load - check error output file to see the specific data"
elif (failed_count == 0) & (correct_counter != 0):
class TestClient(unittest.TestCase):
    def setUp(self):
        super(TestClient, self).setUp()
        docs = []

        self.es_host = os.environ.get('ES_HOST', 'http://*****:*****

    @mock.patch('monolith.client.util.iterweeks')
    def test_datetime_ranges(self, _mock):
        "Test datetime ranges get converted to dates."
        client = self._make_one()
        start = datetime.datetime(2012, 1, 1, 12, 34, 56)
        end = datetime.datetime(2012, 1, 31, 12, 34, 56)
        list(client('downloads_count', start, end, interval='week'))
        self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1))
        assert not isinstance(_mock.call_args[0][0], datetime.datetime)
        self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31))
        assert not isinstance(_mock.call_args[0][1], datetime.datetime)

    def test_date_order(self):
        # Ensure fill doesn't change date ordering.
        client = self._make_one()
        prev_date = datetime.date(2000, 1, 1)

        # Addon 1 doesn't have downloads for every month and the client will
        # fill zeroes for the missing dates.
        hits = list(
            client('downloads_count',
                   START,
                   '2012-05-01',
                   interval='month',
                   add_on='1'))
        for hit in hits:
            d = hit['date']
            assert prev_date < d
            prev_date = d
Example #24
def build_es_index(raw_data_path):
    if "TOPOGRAM_TMP_PATH" in os.environ:
        tmp_path = os.environ.get('TOPOGRAM_TMP_PATH')
    else:
        tmp_path = '/tmp'

    # raw_data_path=os.path.join(raw_path,"data/datazip/selected/")
    pid_file = os.path.join(tmp_path, "csv_chunk")

    # config elasticsearch
    if "TOPOGRAM_ES_HOST" in os.environ:
        es_host = os.environ.get('TOPOGRAM_ES_HOST')
    else:
        es_host = 'http://localhost:9200/'

    # init ElasticSearch
    es = ElasticSearch(es_host)

    # size of CSV chunk to process
    chunksize = 1000

    # parse index name : 2 weeks per index to fasten search
    weeks = {}
    for r in xrange(1, 52, 2):
        weeks[r] = weeks[r + 1] = "weiboscope_" + str(r) + "_" + str(r + 1)

    # for w in  weeks: print w,weeks[w]

    # init
    previous_chunk = 0
    t0 = time()

    for path, subdirs, files in os.walk(raw_data_path):

        # loop through each files
        i_file = 0
        for filename in files:
            # if i==1 : break

            file_is_ok = False

            # check if there is an ongoing task
            if filename[-10:] == "processing":

                file_is_ok = True

                # get previous
                with open(pid_file, "r") as pid:
                    previous_chunk = int(pid.read())

                # get previous
                file_to_process_name = filename

            elif filename[
                    -3:] == "zip" and filename[:4] == "week":  # get only zip files

                file_is_ok = True

            if file_is_ok == True:
                t1 = time()
                i_file += 1

                # flag the file
                zip_path = os.path.join(path, filename)
                # print zip_path
                if filename[-10:] != "processing":
                    os.rename(zip_path, zip_path + ".processing")
                    zip_path = os.path.join(path, filename + ".processing")

                raw_csvname = filename.split(".")[0] + ".csv"

                # read zipped csv files
                with zipfile.ZipFile(zip_path) as z:  # open zip

                    f = z.open(raw_csvname)  # read csv
                    csvfile = pd.read_csv(f,
                                          iterator=True,
                                          chunksize=chunksize)

                    week_number = filename.split(".")[0][4:]
                    index_name = weeks[int(week_number)]
                    # print index_name

                    for i, df in enumerate(csvfile):

                        if i <= previous_chunk:
                            print i, "%d files, already indexed %s" % (
                                i_file, raw_csvname)
                        else:
                            print i, "%d files, now indexing %s" % (
                                i_file, raw_csvname)

                            # fix the date formatting
                            df["created_at"] = df["created_at"].str.replace(
                                " ", "T")

                            try:

                                # fix encoding
                                df["text"] = df["text"].str.decode("utf-8")

                                # convert dataframe to json object
                                records = df.where(pd.notnull(df),
                                                   None).T.to_dict()

                                # convert json object to a list of json objects
                                list_records = [records[it] for it in records]

                                # insert into elasticsearch
                                try:
                                    es.bulk_index(index_name, "tweet",
                                                  list_records)
                                except:
                                    print "error with elasticsearch"
                                    pass

                            except:
                                print "encoding problem..."
                                pass

                            with open(pid_file, "w") as pid:
                                pid.write(str(i))

                print "%s processed in %.3fs" % (raw_csvname, time() - t1)

                # flag the file : done
                # os.rename(zip_path, zip_path+".done")
                os.remove(zip_path)

                # reset counters
                previous_chunk = 0
                with open(pid_file, "w") as pid:
                    pid.write(str(0))

    print "Everything done in %.3fs" % (time() - t0)
Example #25
                        df["created_at"]=df["created_at"].str.replace(" ", "T")
                        
                        try :
                            
                            # fix encoding
                            df["text"]=df["text"].str.decode("utf-8")

                            # convert dataframe to json object
                            records=df.where(pd.notnull(df), None).T.to_dict()

                            # convert json object to a list of json objects
                            list_records=[records[it] for it in records]

                            # insert into elasticsearch
                            try :
                                es.bulk_index(index_name,"tweet",list_records)
                            except :
                                print "error with elasticsearch"
                                pass
                                
                        except :
                            print "encoding problem..."
                            pass

                        with open(pid_file, "w") as pid:
                            pid.write(str(i))

            print "%s processed in %.3fs"%(raw_csvname,time()-t1)

            # flag the file : done
            # os.rename(zip_path, zip_path+".done")
csv_filename='robinhood-daily-rets.csv'
# size of the bulk
chunksize=5000

# parse csv with pandas
csvfile=pd.read_csv(csv_filename)

# init ElasticSearch
es = ElasticSearch('http://104.236.201.91:9200/')

# init index
try :
    es.delete_index("robinhood")
except :
    pass

es.create_index("robinhood")

# start bulk indexing
t0 = time()
print("now indexing %s..."%(csv_filename))

records=csvfile.where(pd.notnull(csvfile), None).T.to_dict()
list_records=[records[it] for it in records]
try :
    es.bulk_index("robinhood","myPortfolio",list_records)
except :
    print("error!, skipping a date")
    pass

print("done in %.3fs"%(time()-t0))
ERRORFILEOUTPUT = open(input6, "w")

print "Running..."

for line in FILENAME:
	fields = line.split(DELIMITER)
	if len(fields) == NUM_OF_FIELDS:
		data.append({
			"Filename" : fields[0].strip(),
			"File Type" : fields[1].strip(),
			"Language1" : fields[2].strip(),
			"Language2" : fields[3].strip(),
			"Language3" : fields[4].strip()
			})
		try:
			conn.bulk_index(PROJECTNAME,INDEXNAME,data)
			correct_counter += 1
		except Exception as e:
			for i in data:
				failurelist.append(data)
			failed_count += 1
		data = []

if correct_counter!=0:
	print "\n",correct_counter," rows were successfully loaded into ES \n"
elif correct_counter==0:
	print "\n",correct_counter," rows were loaded into elasticsearch.\n" 

if (failed_count != 0) & (correct_counter != 0):
	print failed_count,"rows failed to load - check error output file to see the specific data"
elif (failed_count == 0) & (correct_counter != 0):
# open csv file
f = open(raw_data_path+csv_filename) # read csv

# parse csv with pandas
csvfile=pd.read_csv(f, iterator=True, chunksize=chunksize) 

# init ElasticSearch
es = ElasticSearch('http://localhost:9200/')

# init index
try :
    es.delete_index("weiboscope")
except :
    pass

es.create_index("weiboscope")

# start bulk indexing 
print ("now indexing %s..."%(csv_filename))

for i,df in enumerate(csvfile): 
    print (i)
    records=df.where(pd.notnull(df), None).T.to_dict()
    list_records=[records[it] for it in records]
    try :
        es.bulk_index("weiboscope","tweet",list_records)
    except :
        print ("error!, skiping some tweets sorry")
        pass

print( "done in %.3fs"%(time()-t0))
Example #29
def documents_from_mails(mails):
    """Build document from mail"""
    for mail in mails:
        if 'Date' in mail.headers:  # Some mails seem broken.
            yield {
                '@source': 'stuff://',
                '@type': 'mailadmin',
                '@tags': [mail.headers['From']],
                '@fields': mail.headers,
                '@timestamp': parse_date(mail.headers['Date']),
                '@source_host': 'localhost',
                '@source_path': 'mail/admin ',
                '@message': mail.body,
                'id': mail.headers['Message-Id']
            }

if __name__ == '__main__':
    # Instantiate it with an url
    es = ElasticSearch(sys.argv[1])
    # Kibana need this kind of name
    NAME = 'logstash-2013.06.13'
    try:
        es.delete_index(NAME)
    except ElasticHttpNotFoundError:
        pass  # Nobody cares
    emails = mbox(sys.argv[2])
    for n, docs in enumerate(bulk_iterate(documents_from_mails(emails), 100)):
        es.bulk_index(NAME, 'mailadmin', docs)
        print(n)
    print es.refresh(NAME)
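
The last example depends on a bulk_iterate() helper that is not shown on this page; a minimal sketch of a batching generator with the same call signature (the implementation is an assumption inferred from the call site, not the original code):

def bulk_iterate(iterable, size):
    """Yield lists of at most `size` items taken from `iterable`."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == size:
            yield batch
            batch = []
    if batch:  # final partial batch
        yield batch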