class ESDiffs(object):
    """Implementation of Elastic Search as diff backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    @staticmethod
    def to_id(label, old, new):
        return "%s/%s/%s" % (label, old, new)

    def put(self, label, old_version, new_version, diff):
        """Store a diff between two versions of a regulation node"""
        struct = {
            'label': label,
            'old_version': old_version,
            'new_version': new_version,
            'diff': diff
        }
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'diff', struct,
                      id=self.to_id(label, old_version, new_version))

    def get(self, label, old_version, new_version):
        """Find the associated diff"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'diff',
                                 self.to_id(label, old_version, new_version))
            return result['_source']['diff']
        except ElasticHttpNotFoundError:
            return None
def load_pie(filename, index_name, type_name, category, name,
             zone="France", sep=";", display="pie", source="", description=""):
    f = open(filename, mode="r")
    es = ElasticSearch(CONTEXT["datahub-store"])
    categories = {}
    for line in f:
        # split on the first separator only, so values may contain the separator
        key, string_value = line.split(sep, 1)
        value = cjson.decode(string_value)
        categories[key] = value
    serie = {
        "name": name,
        "owner": "public",
        "display": display,
        "zone": zone,
        "category": category,
        "source": source,
        "description": description % (key),
        "data": {"categories": categories.keys(),
                 "series": [{"data": categories.values()}]},
    }
    es.index(index_name, display, serie)
    es.refresh(index_name)
    f.close()
def update_process_datetime(doc_id, timestamp):
    ''' Updates the last_update_date for the document id passed into function.
        The document id in will be the name of another index in the cluster.
    '''
    connection_string = 'http://localhost:9200'
    process_index = 'openfdametadata'
    _type = 'last_run'
    _map = {
        _type: {
            'properties': {
                'last_update_date': {
                    'type': 'date',
                    'format': 'dateOptionalTime'
                }
            }
        }
    }

    es = ElasticSearch(connection_string)
    try:
        es.create_index(process_index)
        logging.info('Creating index %s', process_index)
    except exceptions.IndexAlreadyExistsError:
        logging.info('%s already exists', process_index)

    try:
        es.put_mapping(process_index, doc_type=_type, mapping=_map)
        logging.info('Successfully created mapping')
    except Exception:
        logging.fatal('Could not create the mapping')

    new_doc = {}
    new_doc['last_update_date'] = timestamp
    es.index(process_index,
             doc_type=_type,
             id=doc_id,
             doc=new_doc,
             overwrite_existing=True)
class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'notice', notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)
            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_part': part}}
        else:
            query = {'match_all': {}}
        query = {'fields': ['effective_on', 'fr_url', 'publication_date'],
                 'query': query}
        notices = []
        results = self.es.search(query, doc_type='notice', size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices
def main():
    # Train the Naive Bayes Classifier
    f = open('./data_set/naivebayes_trained_model.pickle')
    NBClassifier = pickle.load(f)

    # ElasticSearch - call the es_indexer file to create the 'sentiment_analysis'
    # index and store the contents of the tweet file in that index
    es = ElasticSearch('http://localhost:9200/')
    es_indexer()

    # Indexing into Elasticsearch
    i = 0
    for each in tweet_data():
        i += 1
        testTweet = each
        processedTestTweet = process_tweet(testTweet)
        sentiment = NBClassifier.classify(
            extract_features(build_feature_vector(processedTestTweet)))
        es.index("sentiment_analysis", "document",
                 {"text": testTweet, "sentiment": sentiment}, id=i)
    print "Indexing completed."

    es.refresh(index="sentiment_analysis")
    print "Index refreshed."

    f.close()
class ESPipeline(object):

    def __init__(self, *args, **kwargs):
        self.client = ElasticSearch('http://localhost:9200/')

    def process_item(self, item, spider):
        self.client.index('wiki', 'page', dict(item))
        return item
class ItvacaturesParseStrategy(ParseStrategy.ParseStrategy):

    def __init__(self):
        self.website = "it-vacatures"
        # bind elasticsearch to es
        self.es = ElasticSearch("http://localhost:9200/")

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parseWerkgever(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        p = re.compile(r"<.*?>")
        werkgever = p.sub("", str(infoTwee))
        return werkgever

    def parseLocatie(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        locatieEen = infoTwee.find_next()
        p = re.compile(r"<.*?>")
        locatieTwee = p.sub("", str(locatieEen))
        p = re.compile(r"Locatie")
        locatie = p.sub("", str(locatieTwee))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find("div", {"id": "job-description"})
        p = re.compile(r"<.*?>")
        inhoud = p.sub("", str(body))
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)
        # parse
        titel = self.parseTitel(soup)
        try:
            werkgever = self.parseWerkgever(soup)
        except:
            werkgever = "-"
        try:
            locatie = self.parseLocatie(soup)
        except:
            locatie = "-"
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r"(?s)/\*.*\*/", "", websiteUrl)
        datum = time.strftime("%d-%m-%Y")
        # generate id (string)
        id = self.website + "-" + re.sub(r"\W+", "", titel)
        # make document to be sent to the elasticsearch database
        document = self.makeDocument(id, titel, websiteUrl, self.website,
                                     datum, werkgever, locatie, "-", inhoud)
        # index (store) the vacancy in the ES database
        self.es.index("vacature-index", "vacature", document, id=document["id"])
        print "Es: " + titel
class IitjobsParseStrategy(ParseStrategy.ParseStrategy):

    def __init__(self):
        self.website = "iitjobs"
        # bind elasticsearch to es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseTitel(self, soup):
        titel = soup.head.title.string
        titel = titel.strip()
        return titel

    def parseWerkgever(self, soup):
        body = soup.find(
            "span", {"id": "ctl00_middleContent_idShowJobDetails_lblCompanyName"})
        p = re.compile(r'<.*?>')
        werkgever = p.sub('', str(body))
        werkgever = werkgever.strip()
        return werkgever

    def parseLocatie(self, soup):
        body = soup.find(
            "span", {"id": "ctl00_middleContent_idShowJobDetails_lblCountryID"})
        p = re.compile(r'<.*?>')
        locatie = p.sub('', str(body))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find("div", {"id": "divJobDescrip"})
        p = re.compile(r'<.*?>')
        inhoud = p.sub('', str(body))
        inhoud = inhoud.strip()
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)
        # parse
        titel = self.parseTitel(soup)
        werkgever = self.parseWerkgever(soup)
        locatie = self.parseLocatie(soup)
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
        datum = time.strftime("%d-%m-%Y")
        # generate id for website (string)
        id = self.website + "-" + re.sub(r'\W+', '', titel)
        # make document to be sent to the elasticsearch database
        document = self.makeDocument(id, titel, websiteUrl, self.website,
                                     datum, werkgever, locatie, "-", inhoud)
        # index (store) the vacancy in the ES database
        self.es.index('vacature-index', 'vacature', document, id=document['id'])
        print('Es: ' + titel)
class IctergezochtParseStrategy(ParseStrategy.ParseStrategy):

    def __init__(self):
        self.website = "ictergezocht"
        # bind elasticsearch to es
        self.es = ElasticSearch("http://localhost:9200/")

    def parseWerkgever(self, soup):
        info = soup.find(class_="highlight")
        p = re.compile(r"<.*?>")
        werkgever = p.sub("", str(info))
        return werkgever

    def parseLocatie(self, soup):
        infoTwee = soup.find(class_="bf")
        locatieEen = infoTwee.find_next()
        locatieTwee = locatieEen.find_next()
        locatieDrie = locatieTwee.find_next()
        locatieVier = locatieDrie.find_next()
        p = re.compile(r"<.*?>")
        locatieVijf = p.sub("", str(locatieVier))
        p = re.compile(r"Locatie")
        locatie = p.sub("", str(locatieVijf))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find(class_="vacancybody")
        p = re.compile(r"<.*?>")
        inhoud = p.sub("", str(body))
        return inhoud

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)
        titel = self.parseTitel(soup)
        if titel.startswith("Vacature"):
            # parse
            werkgever = self.parseWerkgever(soup)
            locatie = self.parseLocatie(soup)
            inhoud = self.parseInhoud(soup)
            websiteUrl = re.sub(r"(?s)/\*.*\*/", "", websiteUrl)
            datum = time.strftime("%d-%m-%Y")
            # generate id for website (string)
            id = self.website + "-" + re.sub(r"\W+", "", titel)
            # make document
            document = self.makeDocument(id, titel, websiteUrl, self.website,
                                         datum, werkgever, locatie, "-", inhoud)
            # index (store) the vacancy in the ES database
            self.es.index("vacature-index", "vacature", document,
                          id=document["id"])
            print "Es: " + titel
def dump_one_and_one_post_elasticsearch(token):
    es = ElasticSearch('http://localhost:9200/')
    for element in Newsfeed.newsfeed(token, [], 0, None, 1000):
        if 'from' in element and 'category' in element['from']:
            continue
        post = Post(element, token)
        es.index(token.lower(), "post", post.serialize())
class ElasticPush(Handler):
    """Posts events to ES."""

    def __init__(self, host='localhost', dest=[]):
        Handler.__init__(self, dest=dest)
        self.es = ElasticSearch('http://%s:9200/' % (host))
        self.source_host = socket.gethostname()

    def push(self, data):
        self.debug("Pushing data %s to elastic search" % (data))
        event = ElasticEvent(data)
        self.es.index("carmanor", "line", event.dict())
        Handler.push(self, data)
def dump_relevant_newsfeed_to_elasticsearch(token):
    es = ElasticSearch('http://localhost:9200/')
    relevant_posts = []
    for element in Newsfeed.newsfeed(token, [], 0, None, 1000):
        if 'from' in element and 'category' in element['from']:
            continue
        post = Post(element, token)
        relevant_posts.append(post.serialize())
    data = {'posts': relevant_posts}
    es.index(token.lower(), "post", data, id=1)
def resource_create(request, index):
    data = json.loads(request.POST['data'])
    data['new'] = True
    # random identifier
    data['uri'] = ''.join(random.choice('0123456789ABCDEF') for i in range(32))
    # store data in elasticsearch
    es = ElasticSearch(settings.ELASTICSEARCH_URL)
    if index == 'persons':
        es.index(index, "person", data)
    elif index == 'institutes':
        es.index(index, "institute", data)
    es.refresh(index)
    return JsonResponse(data)
def main():
    # url = u'https://blog.gslin.org/archives/2015/01/22/5548/backblaze-%E5%85%AC%E4%BD%88%E7%A1%AC%E7%A2%9F%E6%95%85%E9%9A%9C%E7%8E%87/'
    url = u'http://yychen.joba.cc/dev/archives/164'
    es = ElasticSearch(HOST)

    for i in range(20):
        item, url = get_page(url)
        if not url:
            print '\033[1;33mWe\'ve reached the end, breaking...\033[m'
            break

        # put it into es
        print 'Indexing \033[1;37m%s\033[m (%s)...' % (item['title'], item['url'])
        es.index(INDEX, DOCTYPE, doc=item, id=item['url'])
class WeatherDatabase(object):

    def __init__(self, server='http://0.0.0.0:9901'):
        self.server = server
        self.es = ElasticSearch(server)

    def index(self, data):
        return self.es.index('weather', 'sensor', data)
class SensorDatabase(object):

    def __init__(self, server='http://0.0.0.0:9901'):
        self.server = server
        self.es = ElasticSearch(server)

    def index(self, sensor_id, data):
        return self.es.index('domotic', 'sensor_values', data)
def test_cluster_size_3(self):
    cluster = self._make_one(size=3)
    cluster.start()
    self.assertEqual(len(cluster), 3)
    self.assertEqual(len(cluster.hosts), 3)
    self.assertEqual(len(os.listdir(cluster.working_path)), 3)
    self.assertEqual(len(cluster.urls), 3)
    client = ElasticSearch(cluster.urls, max_retries=2)
    self.assertEqual(client.health()['number_of_nodes'], 3)
    # test if routing works and data is actually distributed across nodes
    client.create_index('test_shards', settings={
        'number_of_shards': 1,
        'number_of_replicas': 2,
    })
    client.index('test_shards', 'spam', {'eggs': 'bacon'})
    client.refresh('test_shards')
    shard_info = client.status()['indices']['test_shards']['shards']['0']
    nodes = set([s['routing']['node'] for s in shard_info])
    self.assertTrue(len(nodes) > 1)
class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'notice', notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)
            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_parts': part}}
        else:
            query = {'match_all': {}}
        query = {
            'fields': ['effective_on', 'fr_url', 'publication_date'],
            'query': query
        }
        notices = []
        results = self.es.search(query, doc_type='notice', size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices
class ElasticSearchPublisher(Publisher):
    '''
    Publishes to an ElasticSearch Index
    '''

    def __init__(self, elasticsearch_url, index_name, **kwargs):
        self.elasticsearch_url = elasticsearch_url
        self.index_name = index_name
        self.connection = ElasticSearch(self.elasticsearch_url)

    def publish(self, event, message, event_id):
        response = self.connection.index(self.index_name, event, message,
                                         id=event_id)
        return response
def load_wordcloud(filename, index_name, type_name, category, name,
                   zone="France", sep=";", display="wordcloud", source="",
                   description=""):
    f = open(filename, mode="r")
    es = ElasticSearch(CONTEXT["datahub-store"])
    categories = []
    for line in f:
        # split on the first separator only, so values may contain the separator
        key, string_value = line.split(sep, 1)
        value = cjson.decode(string_value)
        categories.append((value["label"], value["norm_count"] / 100.0))
    serie = {
        "name": name,
        "owner": "public",
        "display": display,
        "zone": zone,
        "category": category,
        "source": source,
        "description": unicode(description) % (name),
        "data": {
            "categories": map(lambda item: item[0], categories),
            "series": [{"data": map(lambda item: item[1], categories)}],
        },
    }
    es.index(index_name, display, serie)
    es.refresh(index_name)
    f.close()
def indexLookup(self):
    es = ElasticSearch('http://104.236.54.204:9200')
    with open("author_lookup.json", "r") as f:
        lookups = json.loads(f.read())
    for lookup in lookups:
        doc = {}
        doc['id'] = lookup
        titles = []
        for x in lookups[lookup]:
            print(x)
            t = str(x[0]) + "|" + x[1]
            titles.append(t)
        doc['titles'] = titles
        try:
            es.index(index="titles", doc_type="title", id=lookup, doc=doc)
        except exceptions.ElasticHttpError as e:
            print("Error on this one")
            print(doc["id"])
            print(str(e))
        print(lookup)
def indexAuthors(self):
    es = ElasticSearch('http://localhost:9200')
    with open("allData.json", "r") as f:
        authors = json.loads(f.read())
    for author in authors:
        # i don't want to add the about as a topic right now
        authors[author]['groups']['about'] = []
        print("Doing", author)
        try:
            es.index(index="gutenberg", doc_type="author",
                     id=authors[author]['id'], doc=authors[author])
        except exceptions.ElasticHttpError as e:
            print("-----------------")
            print("Error indexing this author:", author)
            print(e)
            print("-----------------")
class ElasticConnector(Connector): """ Class for connectors that are operate with elasticsearch database """ MAX_SIZE = 1000 def __init__(self, database, host='http://localhost:9200/'): self.client = ElasticSearch(host) self.index = database self.create_index() def query_to_id(self, query): """ Returns id representation of a specified query This is a temporary method as a replacement of elasticsearch query search """ return "_".join(str(k) + "_" + str(v) for k, v in query.items()).replace("/", "_") def create_index(self): """ Creates specified index or catches an exception if it has already been created """ try: self.client.create_index(self.index) except Exception as e: pass def set_dynamic_mapping(self, collection): """ Sets dynamic mapping for a specified document type """ self.client.put_mapping(self.index, collection, {'dynamic': True}) def save_block(self, block): """ Saves operation info in a database """ super().save_block(block) collection = block.get_collection() dictionary = block.to_dict() query = block.get_query() self.update_by_query(collection, query, block) def update_by_query(self, collection, query, document): """ Sets dynamic mapping for a specified collection, then creates a new id for a document depending on query for it. Saves a new object in a database as a new one """ try: self.set_dynamic_mapping(collection) document_id = document.get_id() document_body = document.to_dict() if "_id" in document_body.keys(): del document_body['_id'] self.client.index(self.index, collection, document_body, id=self.query_to_id(query)) except Exception as e: print(e) pass def find_last_block(self): """ Finds last block index as a value field of a document in a status collection with specified id """ try: document = self.client.get(self.index, 'status', 'height_all_tsx')['_source'] return document['value'] except ElasticHttpNotFoundError as e: return 0 def update_last_block(self, last_block): """ Updates last block index as a value field of a document in a status collection with specified id """ self.client.index(self.index, 'status', {'value': last_block}, id='height_all_tsx') def save_instance(self, instance): """ Saves account or comment object """ self.update_by_query(instance.get_collection(), instance.get_query(), instance) def get_instances_to_update(self, collection): """ Finds and returns all dictionaries with objects that should be updated """ hits = self.client.search("need_update:true", index=self.index, doc_type=collection, size=self.MAX_SIZE)['hits']['hits'] return [{**hit['_source'], **{"_id": hit["_id"]}} for hit in hits] def update_instances(self, collection, instances): """ Resets need_update flag for all instances in a list by their ids in _id field """ for instance in instances: self.client.update(self.index, collection, instance["_id"], doc={'need_update': False})
    except ValueError:
        return False


def isflt(string):
    try:
        float(string)
        return True
    except ValueError:
        return False


for item in root.findall('artikel'):
    list.append(articles)
    articles = {}
    for subitem in item:
        if subitem.text is not None:
            if isint(subitem.text):
                articles[subitem.tag] = int(subitem.text)
            elif isflt(subitem.text):
                articles[subitem.tag] = float(subitem.text)
            else:
                articles[subitem.tag] = subitem.text

for i, article in enumerate(list):
    if len(article) == 0:
        print "Empty value found"
    else:
        print es.index('articles3', 'article', article, id=i)
try:
    es.create_index('recast')
except IndexAlreadyExistsError:
    pass

r = requests.get(ELASTIC_SEARCH_URL)
i = 1
while r.status_code == 200:
    url = 'http://recast-rest-api.herokuapp.com/analysis/{}'.format(i)
    r = requests.get(url)
    if not r.status_code == 200:
        break
    data = cleanJson(r.content)
    es.index('recast', 'analysis', json.dumps(data))
    i = i + 1

r = requests.get(ELASTIC_SEARCH_URL)
i = 1
while r.status_code == 200:
    url = 'http://recast-rest-api.herokuapp.com/requests/{}'.format(i)
    r = requests.get(url)
    if not r.status_code == 200:
        break
    data = cleanJson(r.content)
    es.index('recast', 'requests', json.dumps(data))
    i = i + 1
class Elastic(DataLayer): """ElasticSearch data layer.""" serializers = { 'integer': int, 'datetime': parse_date } def init_app(self, app): app.config.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200/') app.config.setdefault('ELASTICSEARCH_INDEX', 'eve') self.es = ElasticSearch(app.config['ELASTICSEARCH_URL']) self.index = app.config['ELASTICSEARCH_INDEX'] def _get_field_mapping(self, schema): """Get mapping for given field schema.""" if schema['type'] == 'datetime': return {'type': 'date'} elif schema['type'] == 'string' and schema.get('unique'): return {'type': 'string', 'index': 'not_analyzed'} elif schema['type'] == 'string': return {'type': 'string'} def put_mapping(self, app): """Put mapping for elasticsearch for current schema. It's not called automatically now, but rather left for user to call it whenever it makes sense. """ for resource, resource_config in app.config['DOMAIN'].items(): properties = {} properties[config.DATE_CREATED] = self._get_field_mapping({'type': 'datetime'}) properties[config.LAST_UPDATED] = self._get_field_mapping({'type': 'datetime'}) for field, schema in resource_config['schema'].items(): field_mapping = self._get_field_mapping(schema) if field_mapping: properties[field] = field_mapping datasource = (resource, ) # TODO: config.SOURCES not available yet (self._datasource_ex(resource)) mapping = {} mapping[datasource[0]] = {'properties': properties} self.es.put_mapping(self.index, datasource[0], mapping) def find(self, resource, req, sub_resource_lookup): """ TODO: implement sub_resource_lookup """ query = { 'query': { 'query_string': { 'query': request.args.get('q', '*'), 'default_field': request.args.get('df', '_all'), 'default_operator': 'AND' } } } if not req.sort and self._default_sort(resource): req.sort = self._default_sort(resource) # skip sorting when there is a query to use score if req.sort and 'q' not in request.args: query['sort'] = [] sort = ast.literal_eval(req.sort) for (key, sortdir) in sort: sort_dict = dict([(key, 'asc' if sortdir > 0 else 'desc')]) query['sort'].append(sort_dict) if req.where: where = json.loads(req.where) if where: query['filter'] = { 'term': where } if req.max_results: query['size'] = req.max_results if req.page > 1: query['from'] = (req.page - 1) * req.max_results source_config = config.SOURCES[resource] if 'facets' in source_config: query['facets'] = source_config['facets'] try: args = self._es_args(resource) args['es_fiels'] = self._fields(resource) return self._parse_hits(self.es.search(query, **args), resource) except es_exceptions.ElasticHttpError: return ElasticCursor() def find_one(self, resource, **lookup): args = self._es_args(resource) args['es_fields'] = self._fields(resource) if config.ID_FIELD in lookup: try: hit = self.es.get(id=lookup[config.ID_FIELD], **args) except es_exceptions.ElasticHttpNotFoundError: return if not hit['exists']: return doc = hit.get('fields', hit.get('_source', {})) doc['_id'] = hit.get('_id') convert_dates(doc, self._dates(resource)) return doc else: query = { 'query': { 'constant_score': { 'filter': { 'term': lookup } } } } try: args['size'] = 1 docs = self._parse_hits(self.es.search(query, **args), resource) return docs.first() except es_exceptions.ElasticHttpNotFoundError: return None def find_list_of_ids(self, resource, ids, client_projection=None): args = self._es_args(resource) args['es_fields'] = self._fields(resource) return self._parse_hits(self.es.multi_get(ids, **args), resource) def insert(self, resource, doc_or_docs, **kwargs): ids = [] 
kwargs.update(self._es_args(resource)) for doc in doc_or_docs: doc.update(self.es.index(doc=doc, id=doc.get('_id'), **kwargs)) ids.append(doc['_id']) self.es.refresh(self.index) return ids def update(self, resource, id_, updates): args = self._es_args(resource, refresh=True) return self.es.update(id=id_, doc=updates, **args) def replace(self, resource, id_, document): args = self._es_args(resource, refresh=True) args['overwrite_existing'] = True return self.es.index(document=document, id=id_, **args) def remove(self, resource, id_=None): args = self._es_args(resource, refresh=True) if id_: return self.es.delete(id=id_, **args) else: try: return self.es.delete_all(**args) except es_exceptions.ElasticHttpNotFoundError: return def _parse_hits(self, hits, resource): """Parse hits response into documents.""" return ElasticCursor(hits, self._dates(resource)) def _es_args(self, resource, refresh=None): """Get index and doctype args.""" datasource = self._datasource(resource) args = { 'index': self.index, 'doc_type': datasource[0], } if refresh: args['refresh'] = refresh return args def _fields(self, resource): """Get projection fields for given resource.""" datasource = self._datasource(resource) keys = datasource[2].keys() return ','.join(keys) def _default_sort(self, resource): datasource = self._datasource(resource) return datasource[3] def _dates(self, resource): dates = [config.LAST_UPDATED, config.DATE_CREATED] datasource = self._datasource(resource) schema = config.DOMAIN[datasource[0]]['schema'] for field, field_schema in schema.items(): if field_schema['type'] == 'datetime': dates.append(field) return dates
# -*- coding: utf-8 -*-
import codecs
import json
import re

from pyelasticsearch import ElasticSearch

es = ElasticSearch('http://localhost:9200/')
file = codecs.open('shop_data.json', mode='r', encoding='utf-8')
index = 0
for line in file.readlines():
    data = json.loads(line)
    data.pop("id")
    data["shop_tel"] = re.sub(" +", ",", data["shop_tel"])
    data["shop_tel"] = data["shop_tel"].encode("utf8").replace("电话:", "").split(",")[1:]
    data["location"] = re.sub(" +", ",", data["location"]).split(",")
    data["location"] = data["location"][1] + "," + data["location"][0]
    data["shop_tags"] = re.sub("\(\d+\)", "", data["shop_tags"])
    data["shop_tags"] = re.sub(" +", ",", data["shop_tags"])
    data["shop_tags"] = data["shop_tags"].encode("utf8").replace("分类标签:,", "").split(",")[:-1]
    data["open_time"] = re.sub(" +", "", data["open_time"])
    data["open_time"] = data["open_time"].encode("utf8").replace("营业时间:", "").replace("添加", "").replace("修改", "").replace(":", ":")
    index += 1
    es.index('dianping', 'food', data, id=index)
"logo": crawled_data.get("logo"), "twitter": crawled_data.get("twitter"), "station_site": crawled_data.get("station_site"), "primary_genre": crawled_data.get("primary_genre"), "frequency": crawled_data.get("frequency"), "shoutcast_url": crawled_data.get("shoutcast_url"), } # TODO: get lat, lon if hasattr(settings, 'GEONAMES_USER') and settings.GEONAMES_USER != "demo": params = { "name_equals": index_data["city"], "country": index_data["country"], "adminCode1": index_data["state"], "maxRows": 10, "lang": "en", "username": settings.GEONAMES_USER, "style": "medium" } geo_request = requests.get("http://api.geonames.org/searchJSON", params=params) geonames = geo_request.json().get("geonames", []) if geonames: index_data["location"] = { "lat": float(geonames[0]["lat"]), "lon": float(geonames[0]["lng"]) } es.index(INDEX_NAME, 'station', index_data, id=crawled_data.get('id')) print("Bailed after %d failures (pk %d)" % (failures, pk))
def save(self, force_insert=False, force_update=False, **kwargs): es = ElasticSearch(ELASTIC_SEARCH_URL) if self.id: location = self.get_location() location_es = "{0},{1}".format(location.y, location.x) es.update('glamazer', 'modelresult', 'listings.listing.{0}'.format(self.id), script="ctx._source.listing_id = listing;" + "ctx._source.artist_id = artist;" + "ctx._source.artist_avatar = artist_avatar;" + "ctx._source.title = title;" + "ctx._source.location = location;" + "ctx._source.description = description;" + "ctx._source.get_picture = get_picture;" + "ctx._source.metadata = metadata;" + "ctx._source.price = price;" + "ctx._source.likes = likes;" + "ctx._source.comments = comments;" + "ctx._source.tags = tags;" + "ctx._source.status = status;" + "ctx._source.style = style;" + "ctx._source.rating = rating", params={ 'listing':self.id, 'artist':self.get_artist_id(), 'artist_avatar':self.get_artist_avatar(), 'title':self.title, 'location':location_es, 'description':self.description, 'get_picture':self.get_picture(), 'metadata':self.metadata, 'price':self.price, 'likes':self.likes, 'comments':self.comments, 'tags':self.get_tags(), 'status':self.status, 'style':self.get_style(), 'rating':self.get_rating() }) super(Listing, self).save(force_insert, force_update) else: super(Listing, self).save(force_insert, force_update) artist_user = self.artist.user artist_name = artist_user.first_name followers = Followers.objects.select_related().filter(artist=self.artist) for follower in followers: Notification.objects.create( sender = artist_user, receiver = follower.user, time = current_time(), short_text = NOTIFICATIONS_SHORT[10].format(artist=artist_name), long_text = NOTIFICATIONS_LONG[10].format(artist=artist_name, listing=self.title, user_id=self.artist_id, metadata=self.id), ) location = self.get_location() location_es = "{0},{1}".format(location.y, location.x) es.index('glamazer', 'modelresult', { 'listing_id': self.id, 'artist_id': self.artist_id, 'artist_avatar':self.get_artist_avatar(), 'title': self.title, 'location': location_es, 'description': self.description, 'get_picture': self.get_picture(), 'metadata': self.metadata, 'price': self.price, 'likes': self.likes, 'comments':self.comments, 'tags': self.get_tags(), 'status':self.status, 'style':self.get_style(), 'rating':self.get_rating() }, id='listings.listing.{0}'.format(self.id)) es.refresh('glamazer')
def save(self, force_insert=False, force_update=False, **kwargs): es = ElasticSearch(ELASTIC_SEARCH_URL) if self.id: location = self.get_location() location_es = "{0},{1}".format(location.y, location.x) es.update( 'glamazer', 'modelresult', 'listings.listing.{0}'.format(self.id), script="ctx._source.listing_id = listing;" + "ctx._source.artist_id = artist;" + "ctx._source.artist_avatar = artist_avatar;" + "ctx._source.artist_name = artist_name;" + "ctx._source.salon_id = salon;" + "ctx._source.salon_avatar = salon_avatar;" + "ctx._source.salon_name = salon_name;" + "ctx._source.title = title;" + "ctx._source.location = location;" + "ctx._source.description = description;" + "ctx._source.get_picture = get_picture;" + "ctx._source.metadata = metadata;" + "ctx._source.gender = gender;" + "ctx._source.price = price;" + "ctx._source.currency = currency;" + "ctx._source.likes = likes;" + "ctx._source.comments = comments;" + "ctx._source.tags = tags;" + "ctx._source.status = status;" + "ctx._source.style = style;" + "ctx._source.rating = rating", params={ 'listing': self.id, 'artist': self.get_artist_id(), 'artist_avatar': self.get_artist_avatar(), 'artist_name': self.get_artist_name(), 'salon': self.get_salon_id(), 'salon_avatar': self.get_salon_avatar(), 'salon_name': self.get_salon_name(), 'title': self.title, 'location': location_es, 'description': self.description, 'get_picture': self.get_picture(), 'metadata': self.metadata, 'gender': self.gender, 'price': self.price, 'currency': self.currency, 'likes': self.likes, 'comments': self.comments, 'tags': self.get_tags(), 'status': self.status, 'style': self.get_style(), 'rating': self.get_rating() }) super(Listing, self).save(force_insert, force_update) else: super(Listing, self).save(force_insert, force_update) location = self.get_location() location_es = "{0},{1}".format(location.y, location.x) es.index('glamazer', 'modelresult', { 'listing_id': self.id, 'artist_id': self.artist_id, 'artist_avatar': self.get_artist_avatar(), 'artist_name': self.get_artist_name(), 'salon_id': self.get_salon_id(), 'salon_avatar': self.get_salon_avatar(), 'salon_name': self.get_salon_name(), 'title': self.title, 'location': location_es, 'description': self.description, 'get_picture': self.get_picture(), 'metadata': self.metadata, 'gender': self.gender, 'price': self.price, 'currency': self.currency, 'likes': self.likes, 'comments': self.comments, 'tags': self.get_tags(), 'status': self.status, 'style': self.get_style(), 'rating': self.get_rating() }, id='listings.listing.{0}'.format(self.id)) es.refresh('glamazer')
import json

from bson import json_util
from pyelasticsearch import ElasticSearch, bulk_chunks
from pymongo import MongoClient

conn = MongoClient()  # defaults to localhost
db = conn.sci
tweetsdb = db['focal']

elastic = ElasticSearch('http://localhost:9200')
elastic.delete_all_indexes()

# This would be all around better and faster with the bulk API,
# but for the life of me I can't make bulk take the json output.
i = 0
for tweet in tweetsdb.find():
    if i % 1000 == 0:
        print i
    elastic.index('db', 'tweets', json.dumps(tweet, default=json_util.default))
    i += 1

print 'Records written successfully!'
class BonqueParseStrategy(ParseStrategy.ParseStrategy):

    def __init__(self):
        self.website = "bonque"
        # bind elasticsearch to es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parseWerkgever(self, soup):
        info = soup.find(class_="info")
        p = re.compile(r'<.*?>')
        infoText = p.sub('', str(info))
        p2 = re.compile(r'Werkgever ')
        werkgeverText = p2.sub('', infoText)
        p3 = re.compile(r'Locatie.*')
        werkgever = p3.sub('', werkgeverText)
        werkgever = werkgever.strip()
        return werkgever

    def parseLocatie(self, soup):
        info = soup.find(class_="info")
        p = re.compile(r'<.*?>')
        infoText = p.sub('', str(info))
        p2 = re.compile(r'Werkgever ')
        werkgeverText = p2.sub('', infoText)
        p4 = re.compile(r'(?s).*?Locatie ')
        locatie = p4.sub('', werkgeverText)
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.findAll('p')
        inhoud = ""
        for i in body:
            text = i.text
            text = re.sub('\'', '', text)
            text = text.strip()
            inhoud = inhoud + text.encode('utf8')
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)
        # parse
        titel = self.parseTitel(soup)
        werkgever = self.parseWerkgever(soup)
        locatie = self.parseLocatie(soup)
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
        datum = time.strftime("%d-%m-%Y")
        # make id (str)
        id = self.website + "-" + re.sub(r'\W+', '', titel)
        # make document for elasticsearch db
        document = self.makeDocument(id, titel, websiteUrl, self.website,
                                     datum, werkgever, locatie, "-", inhoud)
        # index (store) the vacancy in the ES database
        self.es.index('vacature-index', 'vacature', document, id=document['id'])
        print "Es: " + titel
        os.chmod(newfile, 0644)
        filestring = 'Attachment: <a href="%sattachments/%s">%s</a><br>\n' % (
            baseurl, newshort, newshort)
        load += filestring
    else:
        load = msg.get_payload()

    payload = load.rstrip("\n")
    # attachment strings aren't sanitized, so we're doing this to keep them
    # from breaking anything
    payload = payload.replace("'", "'")

    # write the message to elasticsearch
    if not debugmode:
        x = {'submitted': subdate, 'sent': msgdate, 'from': msgfrom,
             'subject': msgsubject, 'body': payload}
        es = ElasticSearch(elasticsearch)
        print json.dumps(x)
        es.index("oplog", queue, x)

    if txtlog == 'T':
        # write the message to the text log
        # we're reconstructing variables here because the formatting is different
        logsubject = msg['Subject']
        logfrom = msg['From'].replace("'", "")
        logpayload = load.rstrip("\n")
        logrecord = """System date: %s\n
Message date: %s\n
Message from: %s\n
Message subject: %s\n
Message body:\n
%s\n\n
<----------------END MESSAGE ------------------------>\n\n""" % (
            recdate, msgdate, logfrom, logsubject, logpayload)
        degbu(logrecord)
class Worker(Process): def __init__(self, queue, number=-1): self.__queue = queue self.number = number self.name = "index_plays worker #%d" % number self.es = ElasticSearch(settings.ES_URL) Process.__init__(self) def parse_metadata(self, metadata): stream_info = metadata.split(";") for info in metadata.split(";"): key = info[:info.index('=')] value = info[info.index('=') + 1:] if key == 'StreamTitle': info = value[1:-1] def run(self): while 1: item = self.__queue.get() if item is None: break station_id, shoutcast_url, last_playing, last_playing_time = item try: r = requests.get(shoutcast_url, headers={'Icy-Metadata': '1'}, stream=True) except requests.exceptions.ConnectionError: continue # Parse the headers headers = {} line = "" for content in r.iter_content(): line += content if line[-2:] == '\r\n': # Line ended if ":" in line: key = line[:line.index(":")] value = line[line.index(":") + 1:-2] headers[key] = value if len(line) == 2: break line = "" # We really need the metaint, so that we know where to look for metadata if 'icy-metaint' not in headers: print("No icy-metaint!") continue metaint = int(headers.get('icy-metaint')) data = r.raw.read(metaint) length = r.raw.read(1) if len(length) != 1: continue # It seems like sometimes it's getting stuck here. length = struct.unpack('B', length)[0] metadata = r.raw.read(length * 16) r.close() # Now we've got the metadata string! if metadata != last_playing: stream_info = metadata.split(";") for info in metadata.split(";"): try: split_index = info.index('=') except ValueError: continue key = info[:split_index] value = info[split_index + 1:] if key == 'StreamTitle': s = StreamTitle(value) if s.is_song(): print("[worker %s] %s : %s" % (self.number, s.description, s.data.get('text'))) doc = s.data doc['description'] = s.description self.es.index(settings.ES_INDEX, 'play', doc, parent=station_id) last_playing_time = datetime.datetime.now() if last_playing_time is None: last_playing_time = datetime.datetime.now() # A valid station should be playing a song at least once every 30 minutes. if last_playing_time > (datetime.datetime.now() - datetime.timedelta(minutes=30)): # Send it around again.... self.__queue.put( (station_id, shoutcast_url, metadata, last_playing_time))
class LBRest(): def __init__(self, base=None, idx_exp_url=None, txt_mapping=None, cfg_idx=None): """Serve para cosumir o LBG e o ES.""" self.base = base self.idx_exp_url = idx_exp_url if self.idx_exp_url is not None: self.idx_exp_host = idx_exp_url.split('/')[2] self.idx_exp_index = idx_exp_url.split('/')[3] self.idx_exp_type = idx_exp_url.split('/')[4] self.es = ElasticSearch("http://" + self.idx_exp_host) self.txt_mapping = txt_mapping self.cfg_idx = cfg_idx self.con_refsd = False def get_index(self, bases_list): """Obter a a configuração de indexação p/ as bases.""" bases_indexes = [] for base in bases_list: idx_exp_url = base['metadata']['idx_exp_url'] nm_idx = idx_exp_url.split('/')[3] url_txt_idx = config.REST_URL + "/_txt_idx/" + nm_idx req = None try: req = requests.get(url_txt_idx) req.raise_for_status() idx_resp = req.json() except requests.exceptions.HTTPError as e: if e.response.status_code == 404: # NOTE: Para os casos onde não há configuração de # indexação setada na rota "_txt_idx"! By Questor idx_resp = None else: fail_content = None if req is not None: fail_content = req._content else: fail_content = str(e) logger.error("Falha HTTP ao tentar obter configuração de "\ "índice textual! URL: %s. FALHA: %s" % (config.REST_URL, fail_content)) return [] except Exception as e: fail_content = None if req is not None: fail_content = req._content else: fail_content = str(e) logger.error("Erro ao tentar obter a configuração de índice "\ "textual! URL: %s. FALHA: %s" % (config.REST_URL, fail_content)) return [] bases_indexes.append({"base": base, "index": idx_resp}) return bases_indexes def get_bases(self): """Get all bases which has to index registries.""" # NOTE: A construção logo abaixo tá meio tosca. O objetivo é # checar se na estrutura de dados da table "lb_base" já está # o atributo (campo struct) e o campo "txt_mapping". Se não # tiver, tenta obter a base com todos os campos. Trata-se de # um "workaround" sendo o correto que a estrutura de dados # na table "lb_base" esteja atualizada! By Questor bases = [ ] req = None try: params = """{ "select": [ "name", "idx_exp_time", "idx_exp_url", "txt_mapping" ], "literal": "idx_exp is true", "limit": null }""" req = requests.get(config.REST_URL, params={'$$':params}) if config.FORCE_INDEX == True: data = [ ] results = dict({ u'metadata' : { u'idx_exp_url' : u''+config.ES_URL+'', u'name' : u''+config.NM_BASE+'', u'idx_exp_time' : u''+config.TIME_IDX+'' } }) data.append(results) bases = data else: req.raise_for_status() response = req.json() bases = response["results"] except Exception as e: bases = [ ] req = None try: params = """{ "literal": "idx_exp is true", "limit": null }""" req = requests.get(config.REST_URL, params={'$$':params}) req.raise_for_status() response = req.json() bases = response["results"] except Exception as e: # NOTE: A variável de instância "self.con_refsd" # serve p/ evitar que o aviso mais abaixo seja # exibido repetidamente detonando o log! By Questor if self.con_refsd: return bases # NOTE: Estou usando '"Connection refused" in str(e)' # pq "raise_for_status()" mais acima não retorna uma # exceção do tipo "requests.exceptions.HTTPError" de # forma q possamos usar o código em "status_code" # tratar erro de forma mais específica! By Questor if "Connection refused" in str(e) and not self.con_refsd: logger.error('Erro ao obter a lista bases para '\ 'indexação. URL: %s. FALHA: Servidor indisponivel! '\ 'HTTPCode: 502 (Connection refused)!' 
% (config.REST_URL)) self.con_refsd = True return bases self.con_refsd = False fail_content = None if req is not None: fail_content = req._content else: fail_content = str(e) logger.error( ("Erro ao obter a lista bases para indexação. " "URL: %s. FALHA: %s") % ( config.REST_URL, fail_content)) return bases def get_passed_registries(self): """Retorna registros da base de log erros de indexação. Apenas "id_doc_orig" e "dt_last_up_orig". """ # NOTE: Cria base de log se não existir! By Questor self.create_log_base() registries = [ ] params = {'$$':"""{ "select":["id_doc_orig", "dt_last_up_orig"], "literal": "nm_base = '%s'", "limit": null }""" % self.base} url = config.REST_URL + '/log_lbindex/doc' req = None try: req = requests.get(url, params=params) req.raise_for_status() response = req.json() registries = response["results"] except Exception as e: fail_content = None if req is not None: fail_content = req._content else: fail_content = str(e) logger.error(""" 1 Erro ao recuperar registros da base %s'. FALHA: %s """ % ('log_lbindex', fail_content)) resp = {} for reg in registries: resp[reg['id_doc_orig']] = reg['dt_last_up_orig'] return resp def get_registries(self): """Retorna registros à serem indexados que sob certos critérios não tenham falhado no passado. """ # NOTE: Obtêm registros da base de log de erros! Registros # q tenham falhado no passado! By Questor registries = [ ] if config.FORCE_INDEX: params = {'$$':'{"select":["id_doc", "dt_last_up"], "limit": %d}'} else: params = { '$$':'{"select":["id_doc", "dt_last_up"], \ "literal":"dt_idx is null", "limit": %d}' } params.update(result_count='false') params['$$'] = params['$$'] % config.DEFAULT_LIMIT url = config.REST_URL + '/' + self.base + '/doc' req = None try: req = requests.get(url, params=params) req.raise_for_status() response = req.json() registries = response["results"] except Exception as e: fail_content = None if req is not None: fail_content = req._content else: fail_content = str(e) logger.error(""" Erro ao recuperar registros da base %s'. FALHA: %s """ % (self.base, fail_content)) ''' TODO: Essa lógica poderia ser mais eficiente... A princípio vejo duas soluções... 1 - Guardar em cache (mais complicada); 2 - Trazer apenas os registros (id_doc) envolvidos no processo de indexação atual. By Questor ''' ''' TODO: Esse método "self.get_passed_registries()" deveria ser chamado sempre? Mesmo quando a operação é "create"? Checar melhor... By Questor ''' # NOTE: Obtêm registros da base de log de erros! Registros # q tenham falhado no passado! By Questor passed = self.get_passed_registries() _registries = [ ] for reg in registries: if reg['_metadata']['id_doc'] in passed: ''' NOTE: O objetivo aqui é checar se o registro está no log de erros (registros que tentou-se indexar no passado) e se estiver ignora-os a não ser que a data de "update" do registro registrado na base de logs seja diferente da data atual do registro, nesses casos o LBIndex vai tentar novamente! By Questor ''' ''' NOTE: No dict "passed" consta apenas o valor do campo "dt_last_up_orig" da base "log_lbindex"! By Questor ''' dt_last_up = passed[reg['_metadata']['id_doc']] if dt_last_up != reg['_metadata']['dt_last_up']: _registries.append(reg) else: _registries.append(reg) return _registries def get_full_reg(self, id, dt_last_up): """Obtêm o registro doc mais textos extraídos dos arquivos anexos se houverem. """ # TODO: Registrar essa ação no log toda "santa vez"? By Questor logger.info('Recuperando registro %s da base %s ...' 
% (str(id), self.base)) response = None url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/full' req = None try: req = requests.get(url) req.raise_for_status() response = req.json() except Exception as e: fail_content = None if req is not None: fail_content = req._content else: fail_content = str(e) error_msg = """ Erro ao recuperar registro %s na base %s'. FALHA: %s """ % (str(id), self.base, fail_content) # TODO: Pq duas chamadas as logs? By Questor logger.error(error_msg) self.write_error(id, dt_last_up, error_msg) return response def es_create_mapping(self): """Cria um mapping p/ uma base se houver configuração p/ isso.""" response_0 = None response_0_json = None index_url = None try: index_url = ("http://" + self.idx_exp_host + "/" + self.idx_exp_index + "/" + self.idx_exp_type) response_0 = requests.get(index_url + "/_mapping") response_0.raise_for_status() response_0_json = response_0.json() except requests.exceptions.HTTPError as e: # NOTE: Normalmente entrará nesse bloco de código # quando o índice não existe! By Questor self.es_create_index() except requests.exceptions.RequestException as e: raise Exception("Problem in the mapping provider! " + str(e)) except Exception as e: raise Exception("Mapping operation. Program error! " + str(e)) if (response_0.status_code == 200 and not response_0_json and (self.txt_mapping is not None and self.txt_mapping)): response_1 = None try: response_1 = self.es.put_mapping( index=self.idx_exp_index, doc_type=self.idx_exp_type, mapping=self.txt_mapping) if (response_1 is None or response_1.get("acknowledged", None) is None or response_1.get("acknowledged", None) != True): raise Exception("Retorno inesperado do servidor \ ao criar mapping! " + str(response_1)) except Exception as e: raise Exception("Mapping creation error! " + str(e)) def es_create_index(self): """Criar um índice p/ a base com as configurações setadas, não havendo criar um índice genérico. """ response_0 = None try: cfg_idx_holder = None # NOTE: Se não houver configuração de indexação "setada" # o sistema vai criar uma padrão! By Questor if self.cfg_idx is not None and self.cfg_idx: cfg_idx_holder = self.cfg_idx else: cfg_idx_holder = { "settings":{ "analysis":{ "analyzer":{ "default":{ "tokenizer":"standard", "filter":[ "lowercase", "asciifolding" ] } } } } } response_0 = self.es.create_index(index=self.idx_exp_index, settings=cfg_idx_holder) if (response_0 is None or response_0.get("acknowledged", None) is None or response_0.get("acknowledged", None) != True): raise Exception("Retorno inesperado do servidor \ ao criar index! " + str(response_0)) self.es_create_mapping() except IndexAlreadyExistsError as e: self.es_create_mapping() except Exception as e: raise Exception("Index creation error! " + str(e)) def index_member(self, registry, id, dt_last_up): """Criar o índice textual para cada registro.""" logger.info( 'Indexando registro %s da base %s na url %s ...' % ( str(id), self.base, self.idx_exp_url)) try: # NOTE: Trata e cria os mappings e index textuais! # By Questor self.es_create_mapping() self.es.index(self.idx_exp_index, self.idx_exp_type, registry, id=id) return True except Exception as e: error_msg = ("Erro ao indexar registro %s da base %s na url %s'. " "Mensagem de erro: %s") % ( str(id), self.base, self.idx_exp_url, str(e)) logger.error(error_msg) # TODO: Pq dois logs? 
By Questor self.write_error(id, dt_last_up, error_msg) return False def update_dt_index(self, id, dt_last_up): """Atualizar a data de atualização da indexação textual do registro.""" logger.info('Alterando data de indexacao do '\ 'registro %s da base %s ...' % (str(id), self.base)) params = {'value': datetime.datetime.now().\ strftime('%d/%m/%Y %H:%M:%S')} url = (config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/_metadata/dt_idx') req = None try: req = requests.put(url, params=params) req.raise_for_status() return True except Exception as e: fail_content = None if req is not None: fail_content = req._content else: fail_content = str(e) error_msg = 'Erro ao alterar data de indexacao do registro %s na '\ 'base %s. FALHA: %s' % (str(id), self.base, fail_content) logger.error(error_msg) self.write_error(id, dt_last_up, error_msg) return False def write_error(self, id_doc, dt_last_up, error_msg): """Write errors to LightBase.""" error = { 'nm_base': self.base, 'id_doc_orig': id_doc, 'error_msg': error_msg, 'dt_error': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'), 'dt_last_up_orig': dt_last_up } url = config.REST_URL + '/log_lbindex/doc' data = {'value': json.dumps(error)} req = None try: req = requests.post(url, data=data) req.raise_for_status() except Exception as e: fail_content = None if req is not None: fail_content = req._content else: fail_content = str(e) logger.error(""" 0 Erro ao tentar escrever erro no Lightbase. FALHA: %s """ % fail_content) def get_errors(self): """Get all bases which has to index registries.""" errors = [ ] params = """{ "literal": "base = '%s'", "limit": 250 }""" % (self.base) url = config.REST_URL + '/_index_error' req = None try: req = requests.get(url, params={'$$':params}) req.raise_for_status() response = req.json() errors = response["results"] except Exception as e: fail_content = None if req is not None: fail_content = req._content else: fail_content = str(e) logger.error(""" Erro ao tentar recuperar erros de indice. URL: %s. FALHA: %s """ % (url, fail_content)) return errors # TODO: Esse método serve para criar/atualizar p/ uma # indexação (index) padrão! No momento está "desvirtuado", # pois basta apagar o índice p/ q ele seja recriado com a # indexação setada na rota "_txt_idx"! Creio que esse # método não faz muito sentido aqui. Sugiro remover! # By Questor def create_index(self): """Cria índice com as opções de mapeamento padrão Atualiza o índice se já estiver criado. """ settings = { "settings":{ "analysis":{ "analyzer":{ "default":{ "tokenizer":"standard", "filter":[ "lowercase", "asciifolding" ] } } } } } http, space, address, _index, _type = self.idx_exp_url.split('/') try: result = self.es.create_index( index=_index, settings=settings ) except IndexAlreadyExistsError as e: logger.info("O índice já existe. Tentando atualizar o mapping...") self.es.close_index(index=_index) result = self.es.update_settings( index=_index, settings=settings ) logger.info("Mapping atualizado com sucesso. Abrindo o índice...") self.es.open_index(index=_index) logger.info("Índice reaberto com sucesso!") def delete_index(self, registry): """Deletar registros no index.""" id = registry['id_doc'] try: http, space, address, _index, _type = self.idx_exp_url.split('/') self.es.delete(_index, _type, id=id) return True except ElasticHttpNotFoundError as e: return True except Exception as e: error_msg = 'Erro ao deletar indice %s da base %s na url %s. 
'\ 'Mensagem de erro: %s' % \ (str(id), self.base, self.idx_exp_url, str(e)) logger.error(error_msg) return False def delete_error(self, registry): """Deletar registro de erros na rota '_index_error'.""" url = (config.REST_URL + """/_index_error?$$={"literal":"base = '%s' and id_doc = %d"}""") url = url % (registry['base'], registry['id_doc']) logger.info('Deletando registro de erro de indice na url %s' % url) req = None try: req = requests.delete(url) req.raise_for_status() return True except Exception as e: fail_content = None if req is not None: fail_content = req._content else: fail_content = str(e) error_msg = """ Erro ao deletar erro de indice. FALHA: %s """ % (fail_content) logger.error(error_msg) return False @staticmethod def create_log_base(): """Cria base de log do LBIndex caso não exista.""" log_base = model.LogBase() response = log_base.get_base() if not response: # NOTE: Cria a base já que ela não existe! logger.info("Criando base de log do índice...") result = log_base.create_base() if result is None: logger.error("Erro na criação da base de log: \n%s", response.text) return False else: logger.info("Base de log criada com sucesso!") return True
class LBRest():

    def __init__(self, base=None, idx_exp_url=None):
        self.base = base
        self.idx_exp_url = idx_exp_url
        if self.idx_exp_url is not None:
            # idx_exp_url looks like http://host:port/index/type
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es = ElasticSearch('/'.join([http, space, address]))

    def get_bases(self):
        """Get all bases that have registries to index."""
        bases = []
        params = """{
            "select": ["name", "idx_exp_time", "idx_exp_url"],
            "literal": "idx_exp is true",
            "limit": null
        }"""
        req = requests.get(config.REST_URL, params={'$$': params})
        try:
            req.raise_for_status()
            response = req.json()
            bases = response["results"]
        except:
            logger.error("""
                Erro ao tentar recuperar bases. url: %s. Resposta: %s
            """ % (config.REST_URL, req._content))
        return bases

    def get_passed_registries(self):
        """Read the indexing log base and return the registries already processed."""
        # Create the log base if it does not exist yet.
        self.create_log_base()
        registries = []
        params = {'$$': """{
            "select": ["id_doc_orig", "dt_last_up_orig"],
            "literal": "nm_base = '%s'",
            "limit": null
        }""" % self.base}
        url = config.REST_URL + '/log_lbindex/doc'
        req = requests.get(url, params=params)
        try:
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except:
            logger.error("""
                Erro ao recuperar registros da base %s. Resposta: %s
            """ % ('log_lbindex', req._content))
        resp = {}
        for reg in registries:
            resp[reg['id_doc_orig']] = reg['dt_last_up_orig']
        return resp
        # return {reg['id_doc_orig']: reg['dt_last_up_orig'] for reg in registries}

    def get_registries(self):
        """List all registries that still need to be indexed."""
        registries = []
        if config.FORCE_INDEX:
            params = {'$$': '{"select":["id_doc", "dt_last_up"], "limit": %d}'}
        else:
            params = {'$$': '{"select":["id_doc", "dt_last_up"],"literal":"dt_idx is null", "limit": %d}'}
        params.update(result_count='false')
        params['$$'] = params['$$'] % config.DEFAULT_LIMIT
        url = config.REST_URL + '/' + self.base + '/doc'
        req = requests.get(url, params=params)
        try:
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except:
            logger.error("""
                Erro ao recuperar registros da base %s. Resposta: %s
            """ % (self.base, req._content))
            # Example of the failure seen when the backing database is unreachable:
            # Erro ao recuperar registros da base docs_pro'. Resposta: {"status": 500,
            # "request": {"path": "/api/docs_pro/doc", "client_addr": "10.72.246.21",
            # "user_agent": "python-requests/2.3.0 CPython/2.6.6 Linux/2.6.32-504.el6.x86_64",
            # "method": "GET"}, "error_message": "SearchError: (OperationalError) could not
            # connect to server: No route to host\n\tIs the server running on host \"10.72.247.144\"
            # and accepting\n\tTCP/IP connections on port 5432?\n None None", "type": "Exception"}

        # Skip registries whose dt_last_up has not changed since the last run.
        passed = self.get_passed_registries()
        _registries = []
        for reg in registries:
            if reg['_metadata']['id_doc'] in passed:
                dt_last_up = passed[reg['_metadata']['id_doc']]
                if dt_last_up != reg['_metadata']['dt_last_up']:
                    _registries.append(reg)
            else:
                _registries.append(reg)
        return _registries

    def get_full_reg(self, id, dt_last_up):
        logger.info('Recuperando registro %s da base %s ...'
                    % (str(id), self.base))
        response = None
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/full'
        req = requests.get(url)
        try:
            req.raise_for_status()
            response = req.json()
        except:
            error_msg = """
                Erro ao recuperar registro %s na base %s. Resposta: %s
            """ % (str(id), self.base, req._content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return response

    def index_member(self, registry, id, dt_last_up):
        logger.info('Indexando registro %s da base %s na url %s ...'
                    % (str(id), self.base, self.idx_exp_url))
        try:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.index(_index, _type, registry, id=id)
            return True
        except Exception as e:
            error_msg = """
                Erro ao indexar registro %s da base %s na url %s.
                Mensagem de erro: %s
            """ % (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
            return False

    def update_dt_index(self, id, dt_last_up):
        logger.info('Alterando data de indexacao do registro %s da base %s ...'
                    % (str(id), self.base))
        params = {'value': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')}
        url = (config.REST_URL + '/' + self.base + '/doc/' + str(id) +
               '/_metadata/dt_idx')
        req = requests.put(url, params=params)
        try:
            req.raise_for_status()
            return True
        except:
            error_msg = """
                Erro ao alterar data de indexacao do registro %s na base %s. Resposta: %s
            """ % (str(id), self.base, req._content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
            return False

    def write_error(self, id_doc, dt_last_up, error_msg):
        """Write errors to LightBase."""
        error = {
            'nm_base': self.base,
            'id_doc_orig': id_doc,
            'error_msg': error_msg,
            'dt_error': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
            'dt_last_up_orig': dt_last_up
        }
        url = config.REST_URL + '/log_lbindex/doc'
        data = {'value': json.dumps(error)}
        req = requests.post(url, data=data)
        try:
            req.raise_for_status()
        except:
            logger.error("""
                Erro ao tentar escrever erro no Lightbase. Resposta: %s
            """ % req._content)

    def get_errors(self):
        """Get the index errors recorded for this base."""
        errors = []
        params = """{
            "literal": "base = '%s'",
            "limit": 250
        }""" % (self.base)
        url = config.REST_URL + '/_index_error'
        req = requests.get(url, params={'$$': params})
        try:
            req.raise_for_status()
            response = req.json()
            errors = response["results"]
        except:
            logger.error("""
                Erro ao tentar recuperar erros de indice. url: %s. Resposta: %s
            """ % (url, req._content))
        return errors

    def create_index(self):
        """
        Create the index with the default mapping options.
        Update the settings if the index already exists.
        """
        settings = {
            "settings": {
                # "number_of_shards": "5",
                # "number_of_replicas": "1",
                "analysis.analyzer.default.filter.0": "lowercase",
                "analysis.analyzer.default.filter.1": "asciifolding",
                "analysis.analyzer.default.tokenizer": "standard",
                "analysis.analyzer.default.type": "custom",
                "analysis.filter.pt_stemmer.type": "stemmer",
                "analysis.filter.pt_stemmer.name": "portuguese"
            },
            "mappings": {
                "document": {
                    "_timestamp": {
                        "enabled": "true"
                    }
                }
            }
        }
        http, space, address, _index, _type = self.idx_exp_url.split('/')
        try:
            result = self.es.create_index(index=_index, settings=settings)
        except IndexAlreadyExistsError as e:
            # The index already exists: close it, push the updated settings
            # and open it again.
            logger.info("O índice já existe. Tentando atualizar o mapping...")
            self.es.close_index(index=_index)
            result = self.es.update_settings(index=_index, settings=settings)
            logger.info("Mapping atualizado com sucesso. Abrindo o índice...")
            self.es.open_index(index=_index)
            logger.info("Índice reaberto com sucesso!")

    def delete_index(self, registry):
        id = registry['id_doc']
        try:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.delete(_index, _type, id=id)
            return True
        except ElasticHttpNotFoundError as e:
            # The document is already gone; treat it as a successful delete.
            return True
        except Exception as e:
            error_msg = """
                Erro ao deletar indice %s da base %s na url %s.
                Mensagem de erro: %s
            """ % (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            return False

    def delete_error(self, registry):
        url = (config.REST_URL +
               """/_index_error?$$={"literal":"base = '%s' and id_doc = %d"}""")
        url = url % (registry['base'], registry['id_doc'])
        logger.info('Deletando registro de erro de indice na url %s' % url)
        req = requests.delete(url)
        try:
            req.raise_for_status()
            return True
        except:
            error_msg = """
                Erro ao deletar erro de indice. Resposta: %s
            """ % (req._content)
            logger.error(error_msg)
            return False

    @staticmethod
    def create_log_base():
        """Create the index log base if it does not exist yet."""
        log_base = model.LogBase()
        response = log_base.get_base()
        if not response:
            # The base does not exist yet, so create it.
            logger.info("Criando base de log do índice...")
            result = log_base.create_base()
            if result is None:
                logger.error("Erro na criação da base de log: \n%s",
                             response.text)
                return False
            else:
                logger.info("Base de log criada com sucesso!")
        return True
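A minimal driver sketch for the class above, built only from the methods it exposes and the `config` module they reference; the base name, the Elasticsearch URL and the loop itself are assumptions, not part of the original module:

# Hypothetical indexing loop; base name and idx_exp_url are placeholder values.
lb = LBRest(base='docs_pro',
            idx_exp_url='http://127.0.0.1:9200/docs_pro/document')
lb.create_index()
for reg in lb.get_registries():
    id_doc = reg['_metadata']['id_doc']
    dt_last_up = reg['_metadata']['dt_last_up']
    full_reg = lb.get_full_reg(id_doc, dt_last_up)
    if full_reg is not None and lb.index_member(full_reg, id_doc, dt_last_up):
        lb.update_dt_index(id_doc, dt_last_up)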
        return True
    except ValueError:
        return False


def isflt(string):
    try:
        float(string)
        return True
    except ValueError:
        return False


for item in root.findall('artikel'):
    articles = {}
    for subitem in item:
        if subitem.text is not None:
            if isint(subitem.text):
                articles[subitem.tag] = int(subitem.text)
            elif isflt(subitem.text):
                articles[subitem.tag] = float(subitem.text)
            else:
                articles[subitem.tag] = subitem.text
    # Append after the fields are filled, so the last article is not dropped
    # and no empty placeholder dict ends up in the list.
    list.append(articles)

for i, article in enumerate(list):
    if len(article) == 0:
        print "Empty value found"
    else:
        print es.index('articles3', 'article', article, id=i)
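The fragment above begins mid-script, so for reference here is a sketch of the preamble it appears to assume. The XML file name and the Elasticsearch URL are placeholders, not taken from the original:

# Assumed setup for the article-indexing fragment above.
import xml.etree.ElementTree as ET
from pyelasticsearch import ElasticSearch

es = ElasticSearch('http://localhost:9200/')   # placeholder URL
tree = ET.parse('artikelen.xml')               # placeholder file name
root = tree.getroot()
list = []   # note: shadows the built-in list(), kept to match the fragment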
class TestClient(unittest.TestCase):

    def setUp(self):
        super(TestClient, self).setUp()
        docs = []
        self.es_host = os.environ.get('ES_HOST', 'http://*****:*****

    @mock.patch('monolith.client.util.iterweeks')
    def test_datetime_ranges(self, _mock):
        "Test datetime ranges get converted to dates."
        client = self._make_one()
        start = datetime.datetime(2012, 1, 1, 12, 34, 56)
        end = datetime.datetime(2012, 1, 31, 12, 34, 56)
        list(client('downloads_count', start, end, interval='week'))
        self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1))
        assert not isinstance(_mock.call_args[0][0], datetime.datetime)
        self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31))
        assert not isinstance(_mock.call_args[0][1], datetime.datetime)

    def test_date_order(self):
        # Ensure fill doesn't change date ordering.
        client = self._make_one()
        prev_date = datetime.date(2000, 1, 1)
        # Addon 1 doesn't have downloads for every month and the client will
        # fill zeroes for the missing dates.
        hits = list(
            client('downloads_count', START, '2012-05-01', interval='month',
                   add_on='1'))
        for hit in hits:
            d = hit['date']
            assert prev_date < d
            prev_date = d
"description": crawled_data.get("description"), "logo": crawled_data.get("logo"), "twitter": crawled_data.get("twitter"), "station_site": crawled_data.get("station_site"), "primary_genre": crawled_data.get("primary_genre"), "frequency": crawled_data.get("frequency"), "shoutcast_url": crawled_data.get("shoutcast_url"), } # TODO: get lat, lon if hasattr(settings, 'GEONAMES_USER') and settings.GEONAMES_USER != "demo": params = { "name_equals": index_data["city"], "country": index_data["country"], "adminCode1": index_data["state"], "maxRows": 10, "lang": "en", "username": settings.GEONAMES_USER, "style": "medium" } geo_request = requests.get("http://api.geonames.org/searchJSON", params=params) geonames = geo_request.json().get("geonames", []) if geonames: index_data["location"] = { "lat": float(geonames[0]["lat"]), "lon": float(geonames[0]["lng"]) } es.index(INDEX_NAME, 'station', index_data, id=crawled_data.get('id')) print("Bailed after %d failures (pk %d)" % (failures, pk))
def get_netranges(starting_ip='1.0.0.0', last_ip='2.0.0.0',
                  elastic_search_url='http://127.0.0.1:9200/',
                  index_name='netblocks', doc_name='netblock',
                  sleep_min=1, sleep_max=5):
    connection = ElasticSearch(elastic_search_url)
    current_ip = starting_ip

    while True:
        # See if we've finished the range of work
        if ip2long(current_ip) > ip2long(last_ip):
            return

        current_ip = get_next_undefined_address(current_ip)

        if current_ip == None:
            # No more undefined ip addresses
            return

        print current_ip

        try:
            whois_resp = IPWhois(current_ip).lookup_rws()
        except Exception as error:
            """
            If a message like:
            'STDERR: getaddrinfo(whois.apnic.net): Name or service not known'
            appears, then print it out and try the next IP address.
            """
            print type(error), error

            current_ip = get_next_ip(current_ip)
            if current_ip is None:
                return  # No more undefined ip addresses

            gevent.sleep(randint(sleep_min, sleep_max))
            continue

        if 'asn_cidr' in whois_resp and \
           whois_resp['asn_cidr'] is not None and \
           whois_resp['asn_cidr'].count('.') == 3:
            last_netrange_ip = get_netrange_end(whois_resp['asn_cidr'])
        else:
            try:
                last_netrange_ip = \
                    whois_resp['nets'][0]['range'].split('-')[-1].strip()
                assert last_netrange_ip.count('.') == 3
            except:
                # No match found for n + 192.0.1.0.
                print 'Missing ASN CIDR in whois resp: %s' % whois_resp

                current_ip = get_next_ip(current_ip)
                if current_ip is None:
                    return  # No more undefined ip addresses

                gevent.sleep(randint(sleep_min, sleep_max))
                continue

        assert last_netrange_ip is not None and \
            last_netrange_ip.count('.') == 3, \
            'Unable to find last netrange ip for %s: %s' % (current_ip, whois_resp)

        # Save current_ip and whois_resp
        entry = {
            'netblock_start': current_ip,
            'netblock_end': last_netrange_ip,
            'block_size': ip2long(last_netrange_ip) - ip2long(current_ip) + 1,
            'whois': json.dumps(whois_resp),
        }

        keys = ('cidr', 'name', 'handle', 'range', 'description', 'country',
                'state', 'city', 'address', 'postal_code', 'abuse_emails',
                'tech_emails', 'misc_emails', 'created', 'updated')

        for _key in keys:
            entry[_key] = str(whois_resp['nets'][0][_key]) \
                if _key in whois_resp['nets'][0] and \
                whois_resp['nets'][0][_key] else None

            if _key == 'city' and entry[_key] and ' ' in entry[_key]:
                entry[_key] = entry[_key].replace(' ', '_')

        try:
            connection.index(index_name, doc_name, entry)
        except ElasticHttpError, error:
            print 'At %s. Unable to save record: %s' % (current_ip, entry)
            raise error

        current_ip = get_next_ip(last_netrange_ip)
        if current_ip is None:
            return  # No more undefined ip addresses

        gevent.sleep(randint(sleep_min, sleep_max))
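Since get_netranges already yields to gevent between whois lookups, it pairs naturally with a few concurrent greenlets; a minimal invocation sketch, where the IP ranges (and the default Elasticsearch URL) are placeholder values, not from the original script:

# Hypothetical way to run several scans concurrently with gevent.
import gevent

jobs = [
    gevent.spawn(get_netranges, '1.0.0.0', '1.255.255.255'),
    gevent.spawn(get_netranges, '2.0.0.0', '2.255.255.255'),
]
gevent.joinall(jobs)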
class ItvacaturesParseStrategy(ParseStrategy.ParseStrategy):

    def __init__(self):
        self.website = "it-vacatures"
        # Bind Elasticsearch to self.es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parseWerkgever(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        p = re.compile(r'<.*?>')
        werkgever = p.sub('', str(infoTwee))
        return werkgever

    def parseLocatie(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        locatieEen = infoTwee.find_next()
        p = re.compile(r'<.*?>')
        locatieTwee = p.sub('', str(locatieEen))
        p = re.compile(r'Locatie')
        locatie = p.sub('', str(locatieTwee))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find("div", {"id": "job-description"})
        p = re.compile(r'<.*?>')
        inhoud = p.sub('', str(body))
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        # Parse the individual fields.
        titel = self.parseTitel(soup)
        try:
            werkgever = self.parseWerkgever(soup)
        except:
            werkgever = "-"
        try:
            locatie = self.parseLocatie(soup)
        except:
            locatie = "-"
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
        datum = time.strftime("%d-%m-%Y")

        # Generate the document id (string).
        id = self.website + "-" + re.sub(r'\W+', '', titel)

        # Build the document to be sent to the Elasticsearch database.
        document = self.makeDocument(id, titel, websiteUrl, self.website,
                                     datum, werkgever, locatie, "-", inhoud)

        # Index (store) the vacancy in the Elasticsearch database.
        self.es.index('vacature-index', 'vacature', document, id=document['id'])
        print "Es: " + titel
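A short usage sketch for the parser above; the vacancy URL is a placeholder, and getSoup() and makeDocument() are assumed to come from the ParseStrategy base class, which is not shown here:

# Hypothetical usage of the strategy above.
strategy = ItvacaturesParseStrategy()
strategy.parse('http://www.it-vacatures.nl/vacature/example-vacancy')  # placeholder URL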