def __init__(self,
                 hosts=None,
                 index="metrics",
                 doc_type="metric",
                 index_pattern="{index}-{date:%Y.%m.%d}",
                 *args,
                 **kwargs):
        # Assign these in the backend as they are needed when writing metrics
        # to elasticsearch
        self.index = index
        self.doc_type = doc_type
        self.index_pattern = index_pattern

        # setup the client
        self.client = Elasticsearch(hosts=hosts, *args, **kwargs)

        # ensure the index is created
        try:
            self._setup_index()
        except TransportError as exc:
            logger.error('index setup error %r', exc)
        try:
            self._setup_mapping()
        except TransportError as exc:
            logger.error('mapping setup error %r', exc)
Example 2
    def __init__(self, ES):
        self._add = ES
        # create the ES client
        self.es = Elasticsearch(
            self._add,
            # sniff the cluster nodes on startup
            sniff_on_start=True,
            # refresh the node list when a node connection fails
            sniff_on_connection_fail=True,
            # refresh node info every 60 seconds
            sniffer_timeout=60)
Example 3
def main_method_to_loop():
    global nodes_stats_before
    global nodes_stats_after

    get_node_names()

    # Get the initial stats before sleeping
    es = Elasticsearch(es_url,
                       verify_certs=False,
                       connection_class=RequestsHttpConnection)
    print("[INFO] [" + datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "]: Fetch Node BEFORE Stats\n")
    nodes_stats_before = es.nodes.stats()

    # Loop 5 times with a 2-second sleep, updating the cluster stats each time
    for i in range(5):
        print("[INFO] [" + datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
              "]: Fetch Cluster Stats")
        get_cluster_stats()
        time.sleep(2)

    # Get the after stats. This will be after some time
    print("[INFO] [" + datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "]: Fetch Node AFTER Stats\n")
    nodes_stats_after = es.nodes.stats()

    # Now, using the before and after node stats, calculate the ES metrics
    measure_es_metrics()
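
measure_es_metrics() is referenced but not shown in this example; a minimal, hypothetical sketch of the delta calculation it implies, using two standard nodes-stats counters (the function body below is an assumption, not the original implementation):

def measure_es_metrics():
    # Hypothetical sketch: diff the two nodes-stats snapshots taken above and
    # print per-node deltas for two well-known counters.
    before = nodes_stats_before["nodes"]
    after = nodes_stats_after["nodes"]
    for node_id, stats_after in after.items():
        stats_before = before.get(node_id)
        if stats_before is None:
            continue  # node joined the cluster between the two snapshots
        indexed = (stats_after["indices"]["indexing"]["index_total"] -
                   stats_before["indices"]["indexing"]["index_total"])
        queries = (stats_after["indices"]["search"]["query_total"] -
                   stats_before["indices"]["search"]["query_total"])
        print("[INFO] node %s: +%d docs indexed, +%d search queries" %
              (stats_after["name"], indexed, queries))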
def main(args: argparse.Namespace, logger: logging.Logger) -> list:
    """
    Sets up the client, runs the query against Elasticsearch
    and returns a sorted list of dictionaries.

    Parameters
    ----------
    args: argparse.Namespace
        The parsed command line arguments
    logger: logging.Logger
        logger on DEBUG level

    Returns
    -------
    ordered list of dictionaries {'questionId': ..., 'score': ...}
    """

    # setup Client
    logger.info("# setup Client")
    client = Elasticsearch()

    # setup query
    logger.info("# setup query")
    query_request = setup_query(args)

    # make request
    logger.info("# make request")
    bm25_scores = make_request(query_request, client)

    return bm25_scores
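
setup_query() is not shown here; a minimal sketch of the kind of request body it might return, assuming a plain BM25 match query against a hypothetical "question" field:

# Hypothetical request body; the field name "question" and the query text are assumptions.
query_request = {
    "size": 10,
    "query": {"match": {"question": "how do I reset my password"}}
}
bm25_scores = make_request(query_request, Elasticsearch())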
def index_kb(all_docs_kb: dict, client: Elasticsearch, args: argparse.Namespace, logger: logging.Logger):
    """
    Indexes the KB.

    Parameters
    ----------
    all_docs_kb: dict
        The knowledge-base documents to index
    client: Elasticsearch
        Elasticsearch client
    args: argparse.Namespace
        The parsed command line arguments
    logger: logging.Logger
        logger on DEBUG level

    Returns
    -------
    None
    """

    # Delete Index
    client.indices.delete(index=INDEX_NAME, ignore=[404])
    # Create Index
    with open(MAPPING_JSON_FILE) as mapping_json_file:
        source = mapping_json_file.read().strip()
        client.indices.create(index=INDEX_NAME, body=source)

    # upload the KB
    logger.info("# upload the KB")
    bulk(client, all_docs_kb)
    return
def make_request(query_request: dict, client: Elasticsearch) -> list:
    """
    Runs the query_request against the ES client
    and returns a sorted list of dictionaries containing the questionId and score.

    Parameters
    ----------
    query_request: dict
        dictionary containing the request to run against the client
    client: Elasticsearch
        Elasticsearch client

    Returns
    -------
    ordered list of dictionaries {'questionId': ..., 'score': ...}
    """

    # make query request
    response = client.search(
        index=INDEX_NAME,  # index names to search
        body=query_request)

    # filter out questionId and score
    hits = response["hits"]["hits"]
    bm25_scores = [{
        'questionId': hit["_source"]["questionId"],
        'score': hit["_score"]
    } for hit in hits]

    return bm25_scores
Example 7
def create_app(config_class=Config):
    app = Flask(__name__)
    app.config.from_object(config_class)

    db.init_app(app)
    migrate.init_app(app, db)
    login.init_app(app)
    mail.init_app(app)
    bootstrap.init_app(app)
    moment.init_app(app)
    babel.init_app(app)

    app.elasticsearch = Elasticsearch([app.config['ELASTICSEARCH_URL']]) \
        if app.config['ELASTICSEARCH_URL'] else None

    from app.errors import bp as errors_bp
    app.register_blueprint(errors_bp)

    from app.auth import bp as auth_bp
    app.register_blueprint(auth_bp, url_prefix='/auth')

    from app.main import bp as main_bp
    app.register_blueprint(main_bp)

    if not app.debug and not app.testing:
        if app.config['MAIL_SERVER']:
            auth = None
            if app.config['MAIL_USERNAME'] or app.config['MAIL_PASSWORD']:
                auth = (app.config['MAIL_USERNAME'],
                        app.config['MAIL_PASSWORD'])
            secure = None
            if app.config['MAIL_USE_TLS']:
                secure = ()
            mail_handler = SMTPHandler(
                mailhost=(app.config['MAIL_SERVER'], app.config['MAIL_PORT']),
                fromaddr='no-reply@' + app.config['MAIL_SERVER'],
                toaddrs=app.config['ADMINS'],
                subject='Microblog Failure',
                credentials=auth,
                secure=secure)
            mail_handler.setLevel(logging.ERROR)
            app.logger.addHandler(mail_handler)

        if not os.path.exists('logs'):
            os.mkdir('logs')
        file_handler = RotatingFileHandler('logs/microblog.log',
                                           maxBytes=10240,
                                           backupCount=10)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s: %(message)s '
                              '[in %(pathname)s:%(lineno)d]'))
        file_handler.setLevel(logging.INFO)
        app.logger.addHandler(file_handler)

        app.logger.setLevel(logging.INFO)
        app.logger.info('Microblog startup')

    return app
Example 8
def es_reporter():
    es = Elasticsearch([{"host": config.es_host, "port": config.es_port}])
    if not es.ping():
        logger.error(
            "Cannot connect to Elasticsearch server. Host: %s, Port: %s",
            config.es_host,
            config.es_port,
        )

    def on_message(message: Message):
        logger.debug("From queue: %s, received: %s", config.amqp_queue,
                     message.body)
        try:
            validate_payload(message.body)
        except ValueError as err:
            logger.error("%s Message: %s", err, message.body)
            message.reject()
            logger.warning("Rejected: %s", message.body)
            return
        try:
            es.index(index=config.es_index,
                     body=message.body,
                     doc_type=config.es_doc_type)
        except ElasticsearchException as err:
            logger.error("Cannot modify index %s. Error: %s", config.es_index,
                         err)
            message.reject()
            logger.warning("Rejected: %s", message.body)
            return
        logger.debug(
            "To index: %s, type: %s, document: %s",
            config.es_index,
            config.es_doc_type,
            message.body,
        )
        message.ack()
        logger.debug("Acked: %s", message.body)

    try:
        yield on_message
    finally:
        try:
            es.close()
        except AttributeError:
            pass
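
es_reporter is a generator that yields the on_message callback and closes the client afterwards; a minimal sketch of consuming it, assuming it is wrapped with contextlib.contextmanager (the consumer call itself is hypothetical):

from contextlib import contextmanager

with contextmanager(es_reporter)() as on_message:
    # hand the callback to the AMQP consumer of your choice, e.g.
    # queue.consume(on_message)  # hypothetical consumer API
    pass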
Example 9
    def test_chunk_sent_from_different_threads(self, _process_bulk_chunk):
        actions = ({'x': i} for i in range(100))
        results = list(
            helpers.parallel_bulk(Elasticsearch(),
                                  actions,
                                  thread_count=10,
                                  chunk_size=2))

        self.assertTrue(len(set([r[1] for r in results])) > 1)
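
Outside of tests, helpers.parallel_bulk returns a lazy iterator of (ok, item) tuples, so it has to be consumed; a small usage sketch, assuming a reachable local cluster and a hypothetical index name:

from elasticsearch6 import Elasticsearch, helpers

client = Elasticsearch()
actions = ({"_index": "demo", "_type": "_doc", "x": i} for i in range(1000))

# parallel_bulk is a generator; iterate it to actually send the chunks
for ok, item in helpers.parallel_bulk(client, actions,
                                      thread_count=4, chunk_size=100):
    if not ok:
        print("failed action:", item)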
Example 10
def main(args: argparse.Namespace, logger: logging.Logger):
    """Load the model and the KB, index the KB into Elasticsearch, then answer queries interactively."""

    # load_live_model
    (
        model,
        scaler,
        all_docs_kb,
        data_kb_with_vectors,
        qid_to_class,
        class_to_qid,
        without_stopwords,
        num_of_sentences,
    ) = load_model(args.KB_id)

    # set args parameters
    args.without_stopwords = without_stopwords
    args.num_of_sentences = num_of_sentences
    
    # load use and bm25-elasticsearch
    logger.info("# load use and bm25-elasticsearch")
    embed = initialize_use_model()
    client = Elasticsearch()

    # execute main index_elasticsearch
    logger.info("# execute main index_elasticsearch")
    index_kb(all_docs_kb, client, args, logger)

    
    while True:
        query = input("enter your query or q: ")
        if query == "q":
            print("Exiting")
            break
        args.query = query

        start = time.time()
        use_response = make_query_use(data_kb_with_vectors, embed, args, logger)
        end = time.time()
        print("USE query time: %.3fs" % (end - start))

        bm25_response = make_query_bm25(client, args)

        response_vector_level_1 = encode_responses(bm25_response, use_response, len(model.classes_), qid_to_class).reshape(1, -1)

        # scaler.transform returns a new array; it does not scale in place
        response_vector_level_1 = scaler.transform(response_vector_level_1)
        response_vector_level_2 = model.predict_log_proba(response_vector_level_1)

        # sort class indices by descending log-probability and map them back to question ids
        ranked_classes = np.argsort(response_vector_level_2[0])[::-1]
        response = [class_to_qid[pos] for pos in ranked_classes]
        print(response[:10])

    return
Example 11
    def __init__(self, host, port, index_param, field):

        self.field = field
        self.index = index_param

        self.es = Elasticsearch([host + ':' + str(port)])

        if not self.es.indices.exists(index=index_param):
            self.es.indices.create(index=index_param, body={
                "mappings": {
                    "_doc": {
                        "properties": {
                            field: {
                                "type": "geo_point"
                            }
                        }
                    }
                }
            })
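
A document written into this index then has to carry the geo_point field in one of the formats Elasticsearch accepts; a minimal sketch, assuming a local node and hypothetical index and field names:

from elasticsearch6 import Elasticsearch

es = Elasticsearch(['localhost:9200'])
# index one document whose geo_point field uses the {"lat": ..., "lon": ...} object form;
# the index name "places" and field name "location" below are assumptions
es.index(index="places",
         doc_type="_doc",
         body={"location": {"lat": 52.52, "lon": 13.405}})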
Example 12
def query_data(index, dls):

    hosts = ['192.168.5.11', '192.168.5.12', '192.168.5.14']
    es = Elasticsearch(hosts=hosts, port=9200)

    res = es.search(index=index,
                    scroll='1m',
                    timeout='3s',
                    size=1000,
                    body=dls)
    mdata = res.get("hits").get("hits")
    if not mdata:
        print('empty!')
    scroll_id = res["_scroll_id"]
    total = res["hits"]["total"]
    # keep scrolling until all hits beyond the first page have been fetched
    for i in range(int(total / 1000)):
        res_scroll = es.scroll(scroll_id=scroll_id, scroll='1m')
        scroll_id = res_scroll.get("_scroll_id", scroll_id)
        mdata += res_scroll["hits"]["hits"]
    return mdata
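
A minimal usage sketch for query_data, assuming the hosts above are reachable; the index name below is hypothetical:

# a match_all DSL body returns every document in the index
dsl = {"query": {"match_all": {}}}
hits = query_data("my-logs-2020.01.01", dsl)
print("fetched %d documents" % len(hits))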
Example 13
def initialize_elasticsearch():
    n = 0  # retry counter
    while n <= 10:
        try:
            es = Elasticsearch(ES_SERVER)
            print("Initializing Elasticsearch...")
            return es
        except elasticsearch6.exceptions.ConnectionTimeout as err:
            print(err)
            n += 1
            continue
    raise Exception("Could not connect to Elasticsearch after 10 retries")
def elastic_search():
    wait_for(f"http://{config.es_host}:{config.es_port}/")
    es = Elasticsearch([{"host": config.es_host, "port": config.es_port}])

    def search(body: dict) -> dict:
        es.indices.flush()
        es.indices.refresh()
        return es.search(index="test", body=body)

    try:
        yield search
    finally:
        if es.indices.exists("test"):
            es.indices.delete("test")
Example 15
def main():
	parser = argparse.ArgumentParser(
		prog="es-rebalance",
		formatter_class=argparse.RawDescriptionHelpFormatter,
		description="""\
Disk-usage-based rebalancing tool for Elasticsearch.

Swaps large shards with small shards to balance disk usage between hosts.

Starts one set of swaps per run. Swaps will happen asynchronously after running this script. Multiple runs may be needed to fully balance.

This script is rack aware; it will not swap shards so that a primary and replica for the same index are on the same rack.

Example:

	es-rebalance -u localhost:9200 --box-type hot --iterations 50
""")
	parser.add_argument("-u", "--url", required=True, action="append",
		help="URL to cluster. Can be specified multiple times for redundancy.")
	parser.add_argument("-b", "--box-type", required=True,
		help="Box type of nodes to rebalance. One of 'warm' or 'hot'.")
	parser.add_argument("-i", "--iterations", type=int, default=10,
		help="Maximum number of shards to exchange.")
	parser.add_argument("-p", "--shard-percentage", type=float, default=90,
		help="Don't exchange shards whose sizes are within this percent of each other, to avoid swapping similar-sized shards.")
	parser.add_argument("-P", "--node-percentage", type=float, default=10,
		help="Don't exchange between nodes whose sizes are within this many percentage points of each other.")
	parser.add_argument("-v", "--verbose", action="store_true",
		help="Print debug logs.")
	parser.add_argument("--execute", action="store_true",
		help="Run the plan. If not specified, will be a dry run.")
	
	args = parser.parse_args()
	
	logging.basicConfig(level=logging.INFO)
	if args.verbose:
		LOG.setLevel(logging.DEBUG)
	
	es = Elasticsearch(args.url)
	
	plan = Plan(es, args.box_type, args.shard_percentage, args.node_percentage)
	for i in range(args.iterations):
		if not plan.plan_step():
			LOG.warn("Could not move anything, stopping early after %d iteration(s)", i+1)
			break
	
	plan.exec(dry_run=not args.execute)
	if not args.execute:
		LOG.warn("Finished dry run. Use `--execute` to run for real.")
Example 16
class esdata(object):

    #ES = ['127.0.0.1:9200']
    # query for matching documents
    query = {
        "query": {
            "bool": {
                "must": [{
                    "match": {
                        "name": 'a'
                    }
                }]
            }
        },
        "size": 100
    }

    def __init__(self, ES):
        self._add = ES
        # create the ES client
        self.es = Elasticsearch(
            self._add,
            # sniff the cluster nodes on startup
            sniff_on_start=True,
            # refresh the node list when a node connection fails
            sniff_on_connection_fail=True,
            # refresh node info every 60 seconds
            sniffer_timeout=60)

    def saveData(self, records):
        doc = self.createdoc(records)
        t = self.es.bulk(index="index1", doc_type='type1', body=doc)

        print('insert into es successful?', t)

        # r=self.es.search(index="index1",doc_type='type1',body=self.query)
        # print('es result is:',r)

    # transform rows into the bulk request body
    def createdoc(self, records):
        doc = []
        for dup in records:
            doc.append(dict(index={}))
            doc.append(dict(zip(['id', 'name', 'age'], dup)))

        print('doc is:', doc)
        return doc
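
A minimal usage sketch for the class above, assuming a local node; createdoc() zips each tuple into the id/name/age fields:

if __name__ == '__main__':
    # hypothetical host list; saveData() bulk-indexes (id, name, age) tuples
    store = esdata(['127.0.0.1:9200'])
    store.saveData([(1, 'alice', 30), (2, 'bob', 25)])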
Example 17
def get_test_client(nowait=False, **kwargs):
    # construct kwargs from the environment
    kw = {'timeout': 30}
    if 'TEST_ES_CONNECTION' in os.environ:
        from elasticsearch6 import connection
        kw['connection_class'] = getattr(connection,
                                         os.environ['TEST_ES_CONNECTION'])

    kw.update(kwargs)
    client = Elasticsearch([os.environ.get('TEST_ES_SERVER', {})], **kw)

    # wait for yellow status
    for _ in range(1 if nowait else 100):
        try:
            client.cluster.health(wait_for_status='yellow')
            return client
        except ConnectionError:
            time.sleep(.1)
    else:
        # timeout
        raise SkipTest("Elasticsearch failed to start.")
Example 18
import json
import re
from datetime import datetime
from elasticsearch6 import Elasticsearch

if __name__ == '__main__':
    elasticsearch = Elasticsearch(hosts=['192.168.1.68:9200'])
    elasticsearch.indices.create(index='imdb', ignore=400)
    with open('movies.jl', 'r') as file:
        lines = file.readlines()
        for line in lines:
            data = json.loads(line)
            casts = [
                f'{key.replace("-", "").strip()}-{val.replace("-", "").strip()}'
                for key, val in data['cast'].items()
            ]
            try:
                release_date, place = re.findall(r'(.+)\s\((.+)\)',
                                                 data['release_date'])[0]
                data['release_date'] = str(
                    datetime.strptime(release_date,
                                      '%d %B %Y').replace(day=1).date())
            except ValueError:
                try:
                    data['release_date'] = str(
                        datetime.strptime(release_date, '%B %Y').date())
                except ValueError:
                    data['release_date'] = str(
                        datetime.strptime(release_date, '%Y').date())
            except KeyError:
                place = 'Unknown'
Example 19
from datetime import datetime
from elasticsearch6 import Elasticsearch
es = Elasticsearch()

doc = {'id': 0, 'name': 'foo'}
ID = 1
INDEX = "foo_index"
DOC_TYPE = "foo_type"
res = es.delete(index=INDEX, doc_type=DOC_TYPE, id=ID, ignore=[404])  # don't fail if the document is not there yet
es.create(index=INDEX, doc_type=DOC_TYPE, id=ID, body=doc)

# res = es.delete(index=INDEX, doc_type=DOC_TYPE, id=2)
res = es.get(index=INDEX, doc_type=DOC_TYPE, id=ID)

print(res)

# res = es.get(index=InterruptedError, id=1, doc_type=DOC_TYPE)
# print(res['_source'])
Example 20
class ElasticsearchBackend(BaseMetricsBackend):
    def __init__(self,
                 hosts=None,
                 index="metrics",
                 doc_type="metric",
                 index_pattern="{index}-{date:%Y.%m.%d}",
                 *args,
                 **kwargs):
        # Assign these in the backend as they are needed when writing metrics
        # to elasticsearch
        self.index = index
        self.doc_type = doc_type
        self.index_pattern = index_pattern

        # setup the client
        self.client = Elasticsearch(hosts=hosts, *args, **kwargs)

        # ensure the index is created
        try:
            self._setup_index()
        except TransportError as exc:
            logger.error('index setup error %r', exc)
        try:
            self._setup_mapping()
        except TransportError as exc:
            logger.error('mapping setup error %r', exc)

    def get_index(self):
        return self.index_pattern.format(index=self.index, date=datetime.now())

    def _setup_index(self):
        return self.client.indices.create(self.get_index(), ignore=400)

    def _setup_mapping(self):
        return self.client.indices.put_template(
            name="timeexecution-{}".format(self.index),
            body={
                "template": "{}*".format(self.index),
                "mappings": {
                    self.doc_type: {
                        "dynamic_templates": [{
                            "strings": {
                                "mapping": {
                                    "type": "keyword"
                                },
                                "match_mapping_type": "string"
                            }
                        }],
                        "_source": {
                            "enabled": True
                        },
                        "properties": {
                            "name": {
                                "type": "keyword"
                            },
                            "timestamp": {
                                "type": "date"
                            },
                            "hostname": {
                                "type": "keyword"
                            },
                            "value": {
                                "type": "float"
                            },
                            "origin": {
                                "type": "keyword"
                            },
                        }
                    },
                },
                "settings": {
                    "number_of_shards": "1",
                    "number_of_replicas": "1",
                },
            })

    def write(self, name, **data):
        """
        Write the metric to elasticsearch

        Args:
            name (str): The name of the metric to write
            data (dict): Additional data to store with the metric
        """

        data["name"] = name
        if not ("timestamp" in data):
            data["timestamp"] = datetime.utcnow()

        try:
            self.client.index(index=self.get_index(),
                              doc_type=self.doc_type,
                              id=None,
                              body=data)
        except TransportError as exc:
            logger.warning('writing metric %r failure %r', data, exc)

    def bulk_write(self, metrics):
        """
        Write multiple metrics to elasticsearch in one request

        Args:
            metrics (list): data with mappings to send to elasticsearch
        """
        actions = []
        index = self.get_index()
        for metric in metrics:
            actions.append(
                {'index': {
                    '_index': index,
                    '_type': self.doc_type
                }})
            actions.append(metric)
        try:
            self.client.bulk(actions)
        except TransportError as exc:
            logger.warning('bulk_write metrics %r failure %r', metrics, exc)
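
A minimal usage sketch for the backend above, assuming a local cluster; index and doc_type keep their defaults:

# Hypothetical usage of ElasticsearchBackend against a local node.
backend = ElasticsearchBackend(hosts=['localhost:9200'])
backend.write('db.query.duration', value=12.3, hostname='web-1')
backend.bulk_write([
    {'name': 'db.query.duration', 'value': 8.1},
    {'name': 'db.query.duration', 'value': 9.4},
])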
Example 21
from elasticsearch6 import Elasticsearch, helpers
import json
import pprint
from io import StringIO

# Connect to Elasticsearch by IP address and port (9200).
es = Elasticsearch("localhost:9200")


def insertData():
    index = "seoul-metro-logs-2019"

    with open(
            '/home/ec2-user/nclab/elastic-demos/seoul-metro-logs/data/seoul-metro-2019.logs',
            'r') as logfile:
        for line in logfile:
            line = StringIO(line)
            doc = json.load(line)
            es.index(index="seoul-metro-logs-2019", doc_type="_doc", body=doc)


insertData()
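
helpers is imported at the top of this example but never used; indexing line by line sends one HTTP request per document, so a sketch of the same load through helpers.bulk (chunk size chosen arbitrarily) would be considerably faster:

def insertDataBulk():
    # stream the log file through helpers.bulk instead of one es.index() call per line
    def actions():
        with open(
                '/home/ec2-user/nclab/elastic-demos/seoul-metro-logs/data/seoul-metro-2019.logs',
                'r') as logfile:
            for line in logfile:
                yield {
                    "_index": "seoul-metro-logs-2019",
                    "_type": "_doc",
                    "_source": json.loads(line),
                }

    helpers.bulk(es, actions(), chunk_size=1000)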
Example 22
#!/usr/bin/env python

from elasticsearch6 import Elasticsearch
from socket import gethostname
from time import sleep
import json
import pprint
import salt.client

whoami = gethostname()
alert = salt.client.Caller()
pp = pprint.PrettyPrinter(indent=4)
es = Elasticsearch([{'host':'localhost','port':9200}])

S = [ "device", "rule_name", "action", "application", "severity", "threat_type", "rule_name", "rule_name", "src_ip", "dst_ip", "dst_port" ]

Q = {
    "query": {
        "bool": {
          "must": [
            { "terms": {"action": ["deny", "drop", "reset-client", "reset-server", "reset-both", "block-url", "block-ip", "random-drop", "sinkhole", "block"]} },
            { "range": {
              "@timestamp": {
                "gte": "now-1h",
                "lte": "now"
              }
            }}
          ]
        }
    }
}
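
The snippet ends before the query is executed; a minimal sketch of running it, assuming a hypothetical firewall-log index pattern, and pulling the fields listed in S out of each hit:

# Hypothetical continuation: run the query and print the selected fields.
res = es.search(index="firewall-*", body=Q, size=100)   # index pattern assumed
for hit in res["hits"]["hits"]:
    src = hit["_source"]
    pp.pprint({field: src.get(field) for field in S})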
Example 23
from datetime import datetime
from elasticsearch6 import Elasticsearch

es = Elasticsearch()

# create an index in elasticsearch, ignore status code 400 (index already exists)
es.indices.create(index='my-index', ignore=400)

es.index(index="my-index",
         id=42,
         body={
             "any": "data",
             "timestamp": datetime.now()
         })
Example 24
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

from elasticsearch6 import Elasticsearch
es_host = Elasticsearch(
                        ["192.168.0.253:9200", "192.168.0.252:9200"],
                        # sniff the cluster nodes on startup
                        sniff_on_start=True,
                        # refresh the node list and reconnect when a node stops responding
                        sniff_on_connection_fail=True,
                        # refresh node info every 60 seconds
                        sniffer_timeout=60,
                        # 10-second timeout for sniff requests
                        sniff_timeout=10)
Example 25
        help=
        "The elasticsearch host you wish to connect too. (Default: localhost:9200)"
    )
    parser.add_argument(
        "-p",
        "--path",
        action="store",
        default=None,
        help=
        "Path to git repo. Commits used as data to load into Elasticsearch. (Default: None"
    )

    args = parser.parse_args()

    # instantiate es client, connects to localhost:9200 by default
    es = Elasticsearch(args.host)

    # we load the repo and all commits
    load_repo(es, path=args.path)

    # run the bulk operations
    success, _ = bulk(es, UPDATES, index='git')
    print('Performed %d actions' % success)

    # we can now make docs visible for searching
    es.indices.refresh(index='git')

    # now we can retrieve the documents
    initial_commit = es.get(index='git',
                            doc_type='doc',
                            id='20fbba1230cabbc0f4644f917c6c2be52b8a63e8')
Example 26
        # get created date for a repo and fallback to authored_date for a commit
        created_at = parse_date(hit['_source'].get('created_at', hit['_source']['authored_date']))
        print('/%s/%s/%s (%s): %s' % (
                hit['_index'], hit['_type'], hit['_id'],
                created_at.strftime('%Y-%m-%d'),
                hit['_source']['description'].split('\n')[0]))

    print('=' * 80)
    print()

# get trace logger and set level
tracer = logging.getLogger('elasticsearch.trace')
tracer.setLevel(logging.INFO)
tracer.addHandler(logging.FileHandler('/tmp/es_trace.log'))
# instantiate es client, connects to localhost:9200 by default
es = Elasticsearch()

print('Empty search:')
print_hits(es.search(index='git'))

print('Find commits that say "fix" without touching tests:')
result = es.search(
    index='git',
    doc_type='doc',
    body={
      'query': {
        'bool': {
          'must': {
            'match': {'description': 'fix'}
          },
          'must_not': {
Example 27
File: es.py Project: suyanan/docker
def main():
    a_time = time.time()

    args = get_args()
    config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "config.ini")
    #config_file = "/sfs-grand-med-research/home/suyanan/scripts/SV/dbSV/dbsv-es-demos/config.ini"

    #input_file = os.path.join("/sfs-grand-med-research/home/suyanan/research/elasticsearch/demo_data","DBM19A3676-1.af_0.3.minre_0.3.maxre_2.0.filter.bed") #19578SV(814)
    #input_file = os.path.join("/sfs-grand-med-research/home/suyanan/research/elasticsearch/demo_data","demo.bed")
    #input_file = os.path.join("/sfs-grand-med-research/home/suyanan/research/elasticsearch/demo_data","demo.tra.bed")
    #input_file = os.path.join("/sfs-grand-med-research/home/suyanan/research/elasticsearch/demo_data","DBM19A3676-1.re2.filter.bed")     #40746SV(7138TRA)
    input_file = args.input

    ## config params
    config = configparser.ConfigParser()
    config.read(config_file)

    #es info
    es = config['es']
    es_address = es['address']
    client = Elasticsearch(es_address)

    #index mysv
    es_mysv = config['es-mysv']
    es_mysv_index = es_mysv['index']
    es_mysv_doc_type = es_mysv['doc_type']
    MYSV_SIZE = int(es_mysv["SIZE"])
    #index mysample
    es_sample = config['es-sample']
    es_sample_index = es_sample['index']
    es_sample_doc_type = es_sample['doc_type']
    SAMPLE_SIZE = es_sample['SIZE']

    #constant vars
    filter = config['filter']
    MIN_RE = int(filter['MIN_RE'])
    BREAKPOINT = int(filter['BREAKPOINT'])
    OVERLAP = float(filter['OVERLAP'])
    SVTYPE_REGION_LIST = filter['SVTYPE_REGION_LIST']
    nonStandardSVtype_list = filter['nonStandardSVtype']

    # bed columns of the query SVs
    query_dict = config['query']  #chrom_col,start_col,info_col

    ## one-off query for the number of dbSV samples and of healthy samples
    sample_num, samples_list, healthy_db_num, healthy_db_list = get_sample_data(
        client, es_sample_index, es_sample_doc_type, SAMPLE_SIZE)
    print("there are {} samples in dbsv, including {} healthy samples.".format(
        sample_num, healthy_db_num))

    ##SV batch query from ES dbSV
    target_svs_list = get_batch_svs(input_file, query_dict)
    #stat_file = os.path.splitext(input_file)[0]+".dbsv.txt"
    stat_file = os.path.join(
        args.outdir,
        os.path.basename(input_file).strip("bed") + "dbsv.txt")
    with open(stat_file, "w") as out_io:
        out_list = []
        for target_sv in target_svs_list:
            starter = time.time()
            target_chrom = target_sv["target_chrom"]
            target_start = target_sv["target_start"]
            target_svid = target_sv["target_svid"]

            target_end = target_sv["target_end"]
            target_svtype = target_sv[
                "target_svtype"]  #5svs and nonStandardSVtype_list
            target_svlen = target_sv["target_svlen"]
            target_chr2 = target_sv["target_chr2"]

            ##1st query and 2nd filter, filter_sample_list is sample:svid
            filter_num, filter_sample_list = get_sv_freq(
                client, es_mysv_index, es_mysv_doc_type, MIN_RE, BREAKPOINT,
                OVERLAP, SVTYPE_REGION_LIST, MYSV_SIZE, target_chrom,
                target_chr2, target_svtype, target_start, target_end,
                target_svlen, nonStandardSVtype_list)

            ## frequency based on all dbSV samples
            dbsv_freq = filter_num / sample_num
            #print("THE dbsv_freq is {}".format(dbsv_freq))
            #print(filter_sample_list)
            #print(len(filter_sample_list))

            ## frequency based on the healthy samples in dbSV
            healthy_sample_list = []
            healthy_num = 0
            for sample in filter_sample_list:
                #if sample in healthy_db_list:
                if sample.split(":")[0] in healthy_db_list:
                    healthy_num += 1
                    healthy_sample_list.append(
                        sample)  #get healthy samples from filter_sample_list
            dbsv_healthy_freq = healthy_num / healthy_db_num
            #print("The dbsv_healthy_freq is {}".format(dbsv_healthy_freq))
            ender = time.time()
            timer = ender - starter
            #print(healthy_sample_list)
            #print(len(healthy_sample_list))

            #out_str = "\t".join([str(dbsv_freq),str(dbsv_healthy_freq),str(timer)])+"\n"
            out_str = ""
            if len(filter_sample_list) != 0:
                out_str = "\t".join([
                    target_svid,
                    str(dbsv_healthy_freq),
                    str(dbsv_freq), ";".join(filter_sample_list)
                ]) + "\n"
            else:
                out_str = "\t".join(
                    [target_svid,
                     str(dbsv_healthy_freq),
                     str(dbsv_freq), "."]) + "\n"
            out_list.append(out_str)

        out_io.write("\t".join([
            "#SVID", "GrandSV_Healthy_Frequency", "GrandSV_Frequency",
            "GrandSV_LIST"
        ]) + "\n")
        out_io.writelines(out_list)

    b_time = time.time()
    total_time = b_time - a_time
Example 28
from app import celery, create_app
from elasticsearch6 import Elasticsearch

app = create_app()

es = Elasticsearch(
    [app.config.get('ES_HOST')],
    http_auth=(app.config.get('ES_USERNAME'), app.config.get('ES_PASSWORD')),
    port=app.config.get('ES_PORT'),
)

app.app_context().push()
Example 29
 def setUp(self):
     super(ElasticsearchTestCase, self).setUp()
     self.client = Elasticsearch(transport_class=DummyTransport)
Example 30
from datetime import datetime
from elasticsearch6 import Elasticsearch
es = Elasticsearch({"172.16.100.186": "9200"})

doc = {'id': 2, 'name': "乙"}
ID = 2
INDEX = "wuren"
DOC_TYPE = "cn"
# res = es.delete(index=INDEX, doc_type=DOC_TYPE, id=ID)
es.create(index=INDEX, doc_type=DOC_TYPE, id=ID, body=doc)

# res = es.delete(index=INDEX, doc_type=DOC_TYPE, id=2)
res = es.get(index=INDEX, doc_type=DOC_TYPE, id=ID)

print(res)

# res = es.get(index=InterruptedError, id=1, doc_type=DOC_TYPE)
# print(res['_source'])