Example #1
def simple_query(page):
    d = dict()
    query = buildQuery()
    print(query)
    solr = SolrClient(current_app.config['SOLR'])
    res = solr.query('scripties', {
        'q': query,
        'rows': '0',
    })
    count = res.get_num_found()
    pages = math.ceil(count / 10)
    start = (page - 1) * 10
    res = solr.query(
        'scripties', {
            'q': query,
            'rows': '10',
            'start': start,
            'fl': 'id,titel,auteur,jaar',
            'facet': True,
            'facet.field': ['jaar', 'type', 'faculteit'],
        })
    facets = res.get_facets()
    d['result'] = res
    d['pages'] = pages
    d['page'] = page
    d['f_jaar'] = facets['jaar']
    d['f_type'] = facets['type']
    d['f_faculteit'] = collect(facets['faculteit'])
    d['f'] = request.args.get('faculteit')
    d['j'] = request.args.get('jaar')
    d['t'] = request.args.get('type')
    return d
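The buildQuery helper is not shown in this example; a minimal hypothetical sketch, assuming it simply combines a free-text term with the faculteit/jaar/type request arguments used elsewhere in the function, could look like this:

def buildQuery():
    # Hypothetical helper (not part of the original example): combine the
    # free-text term with any facet filters passed as request arguments.
    parts = []
    text = request.args.get('q')
    parts.append('titel:{}'.format(text) if text else '*:*')
    for field in ('faculteit', 'jaar', 'type'):
        value = request.args.get(field)
        if value:
            parts.append('{}:"{}"'.format(field, value))
    return ' AND '.join(parts)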
Example #2
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')

        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.collections.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.collections.create_field(
                    test_config['SOLR_COLLECTION'], field)
            except:
                pass

        #Index Some data
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
Example #3
def update_ml_tag(solr: SolrClient, tweets_core_name, tags_core_name, docs,
                  feat_vectorizer, ml_model, selected_features,
                  hate_indicative_features, scaling_option, sysout, logger):
    tweets = []
    for d in docs:
        text = d['status_text']
        if "rt @" in text.lower():
            start = text.lower().index("rt @") + 4
            text = text[start:].strip()

        tweets.append(text)

    #ml classify, also compute risk scores
    logger.info("begin ml classification for tweets={}, time={}".format(
        len(tweets), datetime.datetime.now()))
    tags, risk_scores = ml_tag(tweets, feat_vectorizer, ml_model,
                               selected_features, hate_indicative_features,
                               scaling_option, sysout, logger, solr,
                               tags_core_name)

    logger.info("ml classification done. updating solr index...{}".format(
        datetime.datetime.now()))

    count = 0
    for idx, tag in enumerate(tags):
        if tag == 0:
            count += 1
            #print(d['status_text'])
        d = docs[idx]
        d['ml_tag'] = str(tag)
        d['tweet_risk'] = risk_scores[idx]

    print(count)
    solr.index(tweets_core_name, docs)
    code = iu.commit(tweets_core_name)
Example #4
    def calculate(term, all_candidates, solr_core_url):
        solrClient = SolrClient(solr_core_url)

        longer_terms = CValueRanker.get_longer_terms(term, all_candidates)
        term_freq_dict, normed_term_dict = solrClient.totaltermfreq(
            FIELD_CONTENT, {term})

        term_freq = list(term_freq_dict.values())[0]

        # print("term freq of '",term,"': ", term_freq)

        num_of_term_words = len(TermUtil.normalise(term).split(' '))
        # print("num of term words:", num_of_term_words)

        log2a = math.log(num_of_term_words, 2)
        # print("log2a:", log2a)

        if longer_terms:
            p_ta = len(longer_terms)
            # print("p_ta:", p_ta)
            sum_fb = TermRanker.sum_ttf_candidates(solrClient, longer_terms)
            # print("sum_fb:", sum_fb)
            term_cValue = log2a * (term_freq - (1 / p_ta) * sum_fb)
        else:
            term_cValue = log2a * term_freq

        return (term, term_cValue)
Example #5
def update_pmi_scores(existing_tags: dict,
                      existing_tag_pairs: dict,
                      solr: SolrClient, core_name, batch_commit):
    count = 0
    batch = []
    for tag_pair, data in existing_tag_pairs.items():
        count += 1
        if count > batch_commit:
            solr.index(core_name, batch)
            code = util.commit(core_name)
            count = 0
            batch = []
            logger.info("\t done batch size={}".format(batch_commit))

        co_freq = data[util.tag_index_field_frequency]
        tags = tag_pair.split(" ")
        t1_freq = existing_tags[tags[0]][util.tag_index_field_frequency]
        t2_freq = existing_tags[tags[1]][util.tag_index_field_frequency]

        if co_freq==0:
            pmi=0
        else:
            pmi = numpy.emath.log(co_freq / (t1_freq * t2_freq + util.score_denominator_min))
        data[util.tag_index_field_pmi] = pmi
        data[util.tag_index_field_text] =tag_pair
        data[util.tag_index_field_type] =1
        batch.append(data)

    # commit the rest
    solr.index(core_name, batch)
    code = util.commit(core_name)
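As a toy illustration of the PMI expression above (not from the original code, and assuming util.score_denominator_min is a small smoothing constant):

import numpy

co_freq, t1_freq, t2_freq = 12, 40, 55   # toy tag-pair and single-tag frequencies
score_denominator_min = 0.00001          # assumed smoothing constant
pmi = numpy.emath.log(co_freq / (t1_freq * t2_freq + score_denominator_min))
print(pmi)  # negative here, since co_freq is much smaller than t1_freq * t2_freq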
Example #6
class TwitterSearch():
    __solr = None
    __core = None
    __api = None

    def __init__(self, oauth):
        super().__init__()
        self.__solr = SolrClient(iu.solr_url)
        self.__core = iu.solr_core_tweets
        self.__api = tweepy.API(oauth)

    def index(self, keywords):
        for keyword in keywords:
            count = 0
            for status in tweepy.Cursor(self.__api.search,
                                        q=keyword,
                                        tweet_mode="extended",
                                        lang="en").items(500):
                count += 1
                # created_at_time
                str_created_at = status.created_at
                str_solr_time = str_created_at.utcnow().strftime(
                    SOLR_TIME_PATTERN)
                docs = [{
                    'id': status.id,
                    'created_at': str_solr_time,
                    'status_text': status.full_text
                }]
                self.__solr.index(self.__core, docs)
            print(str(count) + "," + keyword)
        code = iu.commit(iu.solr_core_tweets)
Example #7
    def update_solr(self, task=None):
        solr = SolrClient(SOLR_URI + '/solr/')
        collection = 'listmanager'

        if not task:
            task = self.task

        document = {}
        document['id'] = task.id
        document['title'] = task.title
        document['note'] = task.note if task.note else ''
        #document['tag'] =[t for t in task.tag.split(',')] if task.tag else []
        document['tag'] = [k.name for k in task.keywords]  # better this than relying on tag

        document['completed'] = task.completed is not None
        document['star'] = task.star # haven't used this yet and schema doesn't currently reflect it

        # note that I didn't think there was any value in indexing or storing context and folder
        document['context'] = task.context.title
        document['folder'] = task.folder.title

        json_docs = json.dumps([document])
        response = solr.index_json(collection, json_docs)

        # response = solr.commit(collection, waitSearcher=False) # doesn't actually seem to work
        # Since solr.commit didn't seem to work, substituted the below, which works
        url = SOLR_URI + '/solr/' + collection + '/update'
        r = requests.post(url, data={"commit":"true"})
        #print(r.text)
        root = ET.fromstring(r.text)
        if root[0][0].text == '0':
            print(self.colorize("solr update successful", 'yellow'))
        else:
            print(self.colorize("there was a problem with the solr update", 'yellow'))
Example #8
def suggest():
    query_key = request.args.get('query')
    solr = SolrClient('http://localhost:8983/solr')
    res = solr.query('myexample',{
            'q':query_key,
        },'suggest')
    return json.dumps(res.data['suggest']['suggest'][query_key]['suggestions'])
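A hedged sketch of wiring this view into a Flask app (the app object and route path are assumptions, not part of the original snippet):

from flask import Flask

app = Flask(__name__)
# Expose the view above as GET /suggest?query=<term>
app.add_url_rule('/suggest', 'suggest', suggest)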
Example #9
 def test_solr_to_solr_with_date(self):
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config['SOLR_SERVER'][0],
                       devel=True,
                       auth=test_config['SOLR_CREDENTIALS'])
     reindexer = Reindexer(source=solr,
                           source_coll='source_coll',
                           dest=solr,
                           dest_coll='dest_coll',
                           date_field='index_date')
     reindexer.reindex()
     try:
         self.assertTrue(solr.transport._action_log[1]['params']['params']
                         ['sort'] == 'index_date asc, id desc')
     except KeyError:
         self.assertTrue(solr.transport._action_log[2]['params']['params']
                         ['sort'] == 'index_date asc, id desc')
     self.assertEqual(
         sorted(solr.query(self.colls[0], {
             'q': '*:*',
             'rows': 10000000
         }).docs, key=lambda x: x['id']),
         sorted(solr.query(self.colls[1], {
             'q': '*:*',
             'rows': 10000000
         }).docs, key=lambda x: x['id']),
     )
Example #10
def search(query_dict):
    #pdb.set_trace()
    #instantiate solr connection
    solr = SolrClient('http://localhost:8983/solr')

    # Generic search if no query input given
    if len(query_dict) == 0:
        query_string = '*:*'
    #retrieve value of field in table and prepare a query string
    else:
        query_string = ''
        query_op = ' AND '
        item_count = 0
        for key in query_dict:
            if len(query_dict[key]) > 0:
                if item_count > 0:
                    query_string = query_string + query_op + key + ':' + query_dict[
                        key]
                else:
                    query_string = query_string + key + ':' + query_dict[key]
                item_count += 1
    res = solr.query('lyrics', {
        'q': query_string,
    })
    return res.data['response']['docs']
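A usage sketch (the field names are assumptions about the 'lyrics' schema): a dict of field/value pairs is ANDed together, and an empty dict falls back to a match-all query.

# Hypothetical calls, assuming the 'lyrics' core has 'artist' and 'title' fields.
docs = search({'artist': 'Adele', 'title': 'Hello'})   # q = artist:Adele AND title:Hello
all_docs = search({})                                  # q = *:*
for doc in docs:
    print(doc.get('title'), '-', doc.get('artist'))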
Example #11
def update_tagrisk_scores(existing_tags: dict,
                          solr: SolrClient, core_name, batch_commit):
    count = 0
    batch = []
    for tag, data in existing_tags.items():
        count += 1
        if count > batch_commit:
            solr.index(core_name, batch)
            code = util.commit(core_name)
            count = 0
            batch = []
            logger.info("\t done batch size={}".format(batch_commit))

        freq = data[util.tag_index_field_frequency]
        freqh = data[util.tag_index_field_frequencyh]

        if freqh==0:
            riskscore=0
        else:
            riskscore = numpy.math.sqrt(freqh / (freq+ util.score_denominator_min))
        data[util.tag_index_field_risk_score] = riskscore
        data[util.tag_index_field_text] =tag
        data[util.tag_index_field_type] =0
        batch.append(data)

    # commit the rest
    solr.index(core_name, batch)
    code = util.commit(core_name)
Example #12
 def test_solr_to_solr_reindex_and_resume_reverse(self):
     """
     Only reindexes half of the collection the first time, then goes back and does a resume to make sure it works.
     """
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
     reindexer = Reindexer(
         source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
     )
     # Make sure only source has data
     self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
     # This gets somewhat of a midpoint date in the range.
     midpoint = datetime.datetime.now() - datetime.timedelta(days=((self._end_date - self._start_date).days / 2))
     # Reindex approximately half of the data by restricting FQ
     reindexer.reindex(fq=["date:[{} TO *]".format(midpoint.isoformat() + "Z")])
     sleep(10)
     # Make sure we have at least 20% of the data.
     dest_count = len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs)
     s_count = len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs)
     self.assertTrue(s_count > dest_count > s_count * 0.20)
     reindexer.resume()
     sleep(10)
     # Make sure counts match up after reindex
     self.assertEqual(
         len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
         len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
     )
Example #13
def detail_query(key):
    solr = SolrClient(current_app.config['SOLR'])
    q = 'id:{}'.format(key)
    res = solr.query(
        'scripties', {
            'q': q,
            'fl': 'titel,auteur,jaar,supervisor,type,faculteit,opleiding,taal',
        })
    return res
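A brief usage sketch with a made-up document id; the response object exposes the matching documents through .docs:

# Hypothetical usage of the helper above.
res = detail_query('scriptie-12345')
if res.get_results_count() == 1:
    doc = res.docs[0]
    print(doc['titel'], doc['auteur'], doc['jaar'])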
Example #14
def index_data():

    docs = get_data()

    client = SolrClient('http://localhost:8983/solr')

    client.index_json('stocks', json.dumps(docs))

    client.commit('stocks')
Example #15
def index_json():

    client = SolrClient('http://localhost:8983/solr')

    docs = [
        {'id' : '8', 'field8' : 'value8'},
    ]

    client.index_json('test', json.dumps(docs))
    client.commit('test')
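A quick follow-up check for the snippet above, assuming the same local Solr instance and 'test' collection; after the commit the document should be retrievable by id:

from SolrClient import SolrClient

client = SolrClient('http://localhost:8983/solr')
res = client.query('test', {'q': 'id:8'})
print(res.get_results_count())   # expected: 1 after the commit above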
Example #16
 def setUpClass(self):
     logging.debug("Starting to run Reindexer Tests")
     self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                            devel=True,
                            auth=test_config['SOLR_CREDENTIALS'])
     self.colls = [
         test_config['SOLR_REINDEXER_COLLECTION_S'],
         test_config['SOLR_REINDEXER_COLLECTION_D']
     ]
     self.rand_docs = RandomTestData()
Example #17
def get_solr():
    solr = SolrClient(current_app.config['SOLR'])
    res = solr.query('scripties', {
        'q': 'titel:muslim',
        'facet': True,
        'facet.field': 'taal',
    })
    return res.get_results_count()


# def get_post(id, check_author=True):
#     post = get_db().execute(
#         'SELECT p.id, title, body, created, author_id, username'
#         ' FROM post p JOIN user u ON p.author_id = u.id'
#         ' WHERE p.id = ?',
#         (id,)
#     ).fetchone()
#     if post is None:
#         abort(404, "Post id {0} doesn't exist.".format(id))
#     if check_author and post['author_id'] != g.user['id']:
#         abort(403)
#     return post

# @bp.route('/<int:id>/update', methods=('GET', 'POST'))
# @login_required
# def update(id):
#     post = get_post(id)
#     if request.method == 'POST':
#         title = request.form['title']
#         body = request.form['body']
#         error = None
#         if not title:
#             error = 'Title is required.'
#         if error is not None:
#             flash(error)
#         else:
#             db = get_db()
#             db.execute(
#                 'UPDATE post SET title = ?, body = ?'
#                 ' WHERE id = ?',
#                 (title, body, id)
#             )
#             db.commit()
#             return redirect(url_for('blog.index'))
#     return render_template('blog/update.html', post=post)

# @bp.route('/<int:id>/delete', methods=('POST',))
# @login_required
# def delete(id):
#     get_post(id)
#     db = get_db()
#     db.execute('DELETE FROM post WHERE id = ?', (id,))
#     db.commit()
#     return redirect(url_for('blog.index'))
Example #18
def update_solr():
    def now():
        return datetime.now().isoformat(' ').split('.')[0]

    solr = SolrClient(SOLR_URI + '/solr/')
    collection = 'listmanager'
    solr_sync = remote_session.query(Sync).get('solr')
    last_solr_sync = solr_sync.timestamp
    log = f"{now()}: last Solr sync = {last_solr_sync.isoformat(' ').split('.')[0]}\n"
    tasks = remote_session.query(Task).filter(Task.modified > last_solr_sync)
    log = f"{now()}: number of tasks modified since "\
          f"last sync = {str(tasks.count())}\n" + log
    max = round(tasks.count(), -2) + 200
    i = -1
    s = 0
    for n in range(100, max, 100):

        documents = []
        for i, task in enumerate(tasks[s:n]):
            document = {}
            document['id'] = task.id
            document['title'] = task.title
            document['note'] = task.note if task.note else ''
            document['tag'] = task.tag.split(',') if task.tag else []

            document['completed'] = task.completed is not None
            document['star'] = task.star  # haven't used this yet and schema doesn't currently reflect it

            # note that I didn't think there was any value in indexing or storing context and folder
            document['context'] = task.context.title
            document['folder'] = task.folder.title

            documents.append(document)

        json_docs = json.dumps(documents)
        response = solr.index_json(collection, json_docs)

        # response = solr.commit(collection, waitSearcher=False) # doesn't actually seem to work
        # Since solr.commit didn't seem to work, substituted the below, which works
        url = SOLR_URI + '/solr/' + collection + '/update'
        r = requests.post(url, data={"commit": "true"})
        #print(r.text)

        #print("Tasks {} to {}".format(s,n))
        s = n

    solr_sync.timestamp = datetime.now() + timedelta(seconds=2)
    remote_session.commit()
    log = f"{now()}: new Solr sync = "\
           f"{solr_sync.timestamp.isoformat(' ').split('.')[0]}\n" + log
    return log, i
Example #19
 def test_index_bad_data(self):
     index = IndexQ(test_config['indexqbase'], 'testq')
     solr = SolrClient(test_config['SOLR_SERVER'], devel=True, auth=test_config['SOLR_CREDENTIALS'])
     if index._is_locked():
         index._unlock()
     self.assertEqual(index.get_all_as_list(),[])
     solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
     todo_file = index.add({'date':'asd'}, finalize=True)
     self.assertEqual(index.get_all_as_list()[0],todo_file)
     with self.assertRaises(SolrError):
         index.index(solr,test_config['SOLR_COLLECTION'])
     self.assertEqual(index.get_all_as_list()[0],todo_file)
     self.assertFalse(index._is_locked())
Example #20
def read_all():

    client = SolrClient('http://localhost:8983/solr')

    res = client.query('test', {
        'q' : '*:*'
    })

    res = json.loads(res.get_json())
    docs = res['response']['docs']

    for doc in docs:
        print (doc)
Example #21
def process_one(record_list, coll):
    url_solr = 'http://xxx.xxx.xxx.xx:xxxx/solr/%s/' % coll
    solrClient = SolrClient(url_solr)

    for record in record_list:
        htmlEntity = HtmlEntity()
        parser = ParserHtml()
        htmlEntity = parser.parseHtml(record, htmlEntity)
        solrClient.addDoc(htmlEntity)
    solrClient.addDocs()
    last_id = record_list[-1]['_id']
    _LOGGER_.info(last_id + ' is Done!')
    del record_list
Example #22
    def build(self):
        try:
            CLIENT = SolrClient(SEARCH_ENGINE.get('URL'))
        except SolrError:
            print(
                "Solr is not running; try running the command: solr start -e cloud"
            )
            return {}

        # http://lucene.apache.org/solr/guide/8_2/requestdispatcher-in-solrconfig.html
        # script = """
        # curl - H
        # 'Content-type:application/json' - d
        # '{"set-property":
        # {"requestDispatcher.requestParsers.enableRemoteStreaming": true}, "set-property":{"requestDispatcher.requestParsers.enableStreamBody": true}}'
        # http://localhost:8983/api/collections/infoportal/config
        # """
        # rc = call(script, shell=True)
        # print(f"Выполнение скрипта: {rc}")

        print("СТАТУС КЛАСТЕРА")
        print(f'CLIENT.collections={CLIENT.collections.clusterstatus()}')

        print('EXPORTING DOCUMENTS from postgres')
        conn = create_connection()

        if conn is None:
            return {}

        try:
            with conn:
                cur = conn.cursor()

                cur.execute("SELECT * FROM documents_documents")
                self.rows = cur.fetchall()

                open(TMP_FILENAME, 'w').close()
                with open(TMP_FILENAME, "wb") as f:
                    f.write(bytes("[", encoding='utf-8'))
                    for row in self.rows:
                        document = SolrDocument(row)
                        f.write(bytes(document.toJSON(), encoding='utf-8'))
                        # print(bytes(json.dumps(document), encoding='utf-8'))
                        # f.write(bytes(json.dumps(document), encoding='utf-8'))
                        f.write(bytes(",", encoding='utf-8'))
                    f.write(bytes("]", encoding='utf-8'))
        except Exception as ex:
            print(f"{ex}")

        print('IMPORTING DOCUMENTS INTO Solr')
        CLIENT.local_index('infoportal', TMP_FILENAME)
Example #23
 def test_solr_to_solr_resume_checkonly(self):
     '''
     Checks that resume(check=True) only verifies the counts/date ranges and does not index anything.
     '''
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
     reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr, dest_coll='dest_coll',
                           date_field='date')
     # Make sure only source has data
     self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)
     reindexer.resume(check=True)
     # Makes sure nothing got indexed
     self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)
Example #24
 def setUpClass(self):
     self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
     self.rand_docs = RandomTestData()
     self.docs = self.rand_docs.get_docs(50)
     
     for field in test_config['collections']['copy_fields']:
         try:
             self.solr.schema.delete_copy_field(test_config['SOLR_COLLECTION'],field)
         except:
             pass
     for field in test_config['collections']['fields']:
         try:
             self.solr.schema.create_field(test_config['SOLR_COLLECTION'],field)
         except:
             pass
Example #25
 def test_solr_to_solr_reindexer_per_shard(self):
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
     reindexer = Reindexer(
         source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", per_shard=True, date_field="date"
     )
     # Make sure only source has data
     self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
     reindexer.reindex()
     # sloppy check over here, will improve later
     self.assertEqual(
         len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
         len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
     )
Example #26
 def test_solr_to_solr_with_date(self):
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
     reindexer = Reindexer(
         source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
     )
     reindexer.reindex()
     try:
         self.assertTrue(solr.transport._action_log[1]["params"]["params"]["sort"] == "index_date asc, id desc")
     except KeyError:
         self.assertTrue(solr.transport._action_log[2]["params"]["params"]["sort"] == "index_date asc, id desc")
     self.assertEqual(
         sorted(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs, key=lambda x: x["id"]),
         sorted(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs, key=lambda x: x["id"]),
     )
Example #27
 def test_solr_to_solr_reindex_and_resume_reverse(self):
     '''
     Only reindexes half of the collection the first time, then goes back and does a resume to make sure it works.
     '''
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config['SOLR_SERVER'][0],
                       auth=test_config['SOLR_CREDENTIALS'])
     reindexer = Reindexer(source=solr,
                           source_coll='source_coll',
                           dest=solr,
                           dest_coll='dest_coll',
                           date_field='date')
     #Make sure only source has data
     self.assertEqual(
         len(
             solr.query(self.colls[0], {
                 'q': '*:*',
                 'rows': 10000000
             }).docs), 50000)
     self.assertEqual(
         len(
             solr.query(self.colls[1], {
                 'q': '*:*',
                 'rows': 10000000
             }).docs), 0)
     #This gets somewhat of a midpoint date in the range.
     midpoint = (datetime.datetime.now() - datetime.timedelta(days=(
         (self._end_date - self._start_date).days / 2)))
     #Reindex approximately half of the data by restricting FQ
     reindexer.reindex(
         fq=['date:[{} TO *]'.format(midpoint.isoformat() + 'Z')])
     sleep(10)
     #Make sure we have at least 20% of the data.
     dest_count = len(
         solr.query(self.colls[1], {
             'q': '*:*',
             'rows': 10000000
         }).docs)
     s_count = len(
         solr.query(self.colls[0], {
             'q': '*:*',
             'rows': 10000000
         }).docs)
     self.assertTrue(s_count > dest_count > s_count * .20)
     reindexer.resume()
     sleep(10)
     #Make sure counts match up after reindex
     self.assertEqual(
         len(
             solr.query(self.colls[0], {
                 'q': '*:*',
                 'rows': 10000000
             }).docs),
         len(
             solr.query(self.colls[1], {
                 'q': '*:*',
                 'rows': 10000000
             }).docs))
Example #28
def get_latest_update(url, collection, query):
    dttm = None
    solr = SolrClient(url)
    res = solr.query(collection, {
            'q': query,
            'rows': 1,
            'sort': 'system_mtime desc'
    })
    pp.pprint(res.get_results_count())
    
    if res.get_results_count() == 1:
        pp.pprint(res.docs[0]['system_mtime'])
        date = res.docs[0]['system_mtime']
        dttm = datetime.strptime(date,"%Y-%m-%dT%H:%M:%SZ")
        pp.pprint(dttm)
    return dttm
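A hedged usage sketch (the URL, collection name, and query are placeholders): the function returns a datetime or None, which can gate an incremental harvest.

# Hypothetical invocation with placeholder connection details.
last = get_latest_update('http://localhost:8983/solr', 'archivesspace', '*:*')
if last is None:
    print('collection is empty; doing a full harvest')
else:
    print('only fetching records modified after', last.isoformat())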
Example #29
def get_tweets_by_time(timespan, solr:SolrClient, core_name="tweets"):
    rows=100 #100 results per page
    stop=False
    start=0
    facet_counts=None

    q='created_at:' + timespan+' AND ml_tag:0'

    while not stop:
        res = solr.query(core_name, {
            'q':q, #remember we only show tweets tagged as hate (0)
            'facet.field':'entities_hashtag', #count results per facet (NOTE: not every tweet will have a hashtag, but this is ok)
            'facet':"on", #switch on facet search
            'facet.mincount':"1", #show facets that have at least 1 result
            'rows':rows,
            'fl':'*',   #return all fields from the index (when available)
            'start':start, #start from
            'sort':'tweet_risk desc'}) #sort by risk_score descending
        start+=rows #advancing start moves to the next page; for a specific page number, work out 'start' as pagenum*rows
        print("total number found={}".format(res.num_found))
        if start>res.num_found:
            stop=True

        #assign facet results to another var. facet counts is for the whole dataset, not just this page
        if facet_counts is None:
            facet_counts=res.data['facet_counts']['facet_fields']['entities_hashtag']

        #now go through every page, every result
        for d in res.docs: #res.docs only contain documents on the CURRENT page
            print("https://twitter.com/"+d['user_screen_name']+"/"+d['id'])
            if 'coordinates' in d.keys():
                print(d['coordinates'])

    #finally print facet counts
    print(facet_counts)
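A usage sketch: the timespan argument is spliced directly into the created_at field query, so it must be a Solr date-range expression; the range below is only an illustrative value.

from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')   # assumed local Solr
get_tweets_by_time('[2018-01-01T00:00:00Z TO 2018-01-31T23:59:59Z]', solr,
                   core_name='tweets')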
Example #30
def update(solr: SolrClient, tweet_core_name, tag_core_name, timespan, rows,
           feat_vectorizer, ml_model, selected_features,
           hate_indicative_features, scaling_option, sysout, logger):

    stop = False
    start = 0
    while not stop:
        logger.warn("Processing from {} for a batch of {}".format(start, rows))
        print("Processing from {} for a batch of {}".format(start, rows))
        res = solr.query(
            tweet_core_name, {
                'q': 'created_at:' + timespan,
                'rows': rows,
                'fl': '*',
                'start': start,
                'sort': 'id asc'
            })
        start += rows
        if start > res.num_found:
            stop = True

        #apply pretrained ML model to tag data and update them
        update_ml_tag(solr, tweet_core_name, tag_core_name, res.docs,
                      feat_vectorizer, ml_model, selected_features,
                      hate_indicative_features, scaling_option, sysout, logger)

    pass
Example #31
 def setUpClass(self):
     self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                            devel=True,
                            auth=test_config['SOLR_CREDENTIALS'])
     self.rand_docs = RandomTestData()
     self.docs = self.rand_docs.get_docs(50)
     self.coll = test_config['SOLR_COLLECTION'] + str(random.random() * 100)
     self.temp_dir = test_config['temp_data']
     res, con_info = self.solr.collections.api(
         'create', {
             'name': self.coll,
             'numShards': 1,
             'replicationFactor': 1,
             'collection.configName': 'basic_configs'
         })
     self.zk = self.solr.get_zk()
Example #32
def do_it():
    global omd
    global s3
    global ss  # the pickle
    global tmpdir
    global ctr
    global solr
    global repo_ctr
    omd = get_details()
    main_log.debug("temp: {} url: {} s3 yaml:{} ".format(
        omd.get('tmpdir'), omd.get('pdfurl'), omd.get('s3_yaml')))
    instance = omd.get('instance')
    main_log.info("Instance: " + instance)
    main_log.info("retrieving saved state, if any, at {}".format(
        omd.get("savedstate")))
    ss = savestate(omd.get("savedstate"))
    if all:
        ss.clear()
    solr = SolrClient(omd.get('solr_url'))
    tmpdir = omd.get('tmpdir')
    try:
        s3 = S3(configpath=omd.get('s3_yaml'))
    except Exception as e:
        raise e
    aspace = ASpace()
    for repo in aspace.repositories:
        if all or repo_code is None or repo.repo_code == repo_code:
            process_repository(repo)
            repo_ctr += 1
    ss.save()  # last time for good luck!
Example #33
 def test_solr_to_solr_resume_checkonly(self):
     """
     Checks the date_range_query generation function. Since it's pretty simple, running all the tests as one
     """
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
     reindexer = Reindexer(
         source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
     )
     # Make sure only source has data
     self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
     reindexer.resume(check=True)
     # Makes sure nothing got indexed
     self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
Example #34
def get_tags_by_pmi(target_tag, solr:SolrClient, core_name="tags"):
    #http://localhost:8983/solr/tags/select?indent=on&q=tag_text:banmuslims%20AND%20type:1&wt=json
    rows=100 #100 results per page
    stop=False
    start=0

    q='tag_text:' + target_tag+' AND type:1' #0=single tag; 1=tag pairs
    #because we need to get tags similar to this target, so we need to get all pairs and process them

    while not stop:
        res = solr.query(core_name, {
            'q':q, #query tag pairs that contain the target tag
            'rows':rows,
            'fl':'*',   #return all fields from the index (when available)
            'start':start, #start from
            'sort':'pmi desc'}) #sort by pmi descending
        start+=rows #advancing start moves to the next page; for a specific page number, work out 'start' as pagenum*rows
        print("total number found={}".format(res.num_found))
        if start>res.num_found:
            stop=True

        #now go through every page, every result
        for d in res.docs: #res.docs only contain documents on the CURRENT page
            tags=d['tag_text'].split(" ")
            relevant_tag=tags[0]
            if relevant_tag==target_tag:
                relevant_tag=tags[1]
            print(relevant_tag + ", pmi=" + str(d['pmi']))
Example #35
 def test_get_date_range_query(self):
     '''
     Checks the date_range_query generation function. Since it's pretty simple, running all the tests as one
     '''
     solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
     reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr, dest_coll='dest_coll',
                           date_field='index_date')
     self.assertEqual(
         reindexer._get_date_range_query('2015-11-10', '2015-12-11'),
         {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true', 'facet.range': 'index_date',
          'facet.range.start': '2015-11-10', 'q': '*:*', 'facet.range.include': 'all', 'facet.range.gap': '+1DAY'}
     )
     self.assertEqual(
         reindexer._get_date_range_query('2015-11-10', '2015-12-11', date_field='date123'),
         {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true', 'facet.range': 'date123',
          'facet.range.start': '2015-11-10', 'q': '*:*', 'facet.range.include': 'all', 'facet.range.gap': '+1DAY'}
     )
     self.assertEqual(
         reindexer._get_date_range_query('2015-11-10', '2015-12-11', date_field='date123', timespan='MONTH'),
         {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true', 'facet.range': 'date123',
          'facet.range.start': '2015-11-10', 'q': '*:*', 'facet.range.include': 'all', 'facet.range.gap': '+1MONTH'}
     )
     self.assertEqual(
         reindexer._get_date_range_query('2015-11-10', '2015-12-11', timespan='MONTH'),
         {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true', 'facet.range': 'index_date',
          'facet.range.start': '2015-11-10', 'q': '*:*', 'facet.range.include': 'all', 'facet.range.gap': '+1MONTH'}
     )
Example #36
    def test_index_dynamic_collections_basic_1(self):
        index = IndexQ(test_config['indexqbase'], 'testq')
        solr = SolrClient(test_config['SOLR_SERVER'],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        if index._is_locked():
            index._unlock()
        self.assertEqual(index.get_all_as_list(), [])

        # Set up mock for indexing
        temp = {}

        def mock(temp, coll, docs):
            temp[coll] = docs
            return True

        todo_file = index.add([
            {'type': '1', 'data': '1'},
            {'type': '1', 'data': '2'},
            {'type': '1', 'data': '3'},
            {'type': '2', 'data': '4'},
            {'type': '3', 'data': '5'},
        ], finalize=True)
        runner_wrap = index._wrap_dynamic(partial(mock, temp),
                                          lambda x: x['type'], todo_file)
        self.assertTrue(runner_wrap)
        self.assertEqual(json.loads(temp['3']), [{"data": "5", "type": "3"}])
        self.assertEqual(json.loads(temp['2']), [{'type': '2', 'data': '4'}])
        self.assertEqual(
            sorted(json.loads(temp['1']), key=lambda x: x['data']),
            sorted([{'type': '1', 'data': '1'},
                    {'type': '1', 'data': '2'},
                    {'type': '1', 'data': '3'}],
                   key=lambda x: x['data']))
        self.assertFalse(
            index.get_all_as_list())  # Make sure item is completed
Example #37
def get_solr_connection():
    '''Initialize a Solr connection using project settings'''
    # TODO: error handling on config not present?
    solr_config = settings.SOLR_CONNECTIONS['default']
    solr = SolrClient(solr_config['URL'])
    # NOTE: may want to extend SolrClient to set a default collection
    solr_collection = solr_config['COLLECTION']
    return solr, solr_collection
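A usage sketch, assuming a Django settings module with SOLR_CONNECTIONS configured with the keys the function reads:

# Hypothetical settings, mirroring the keys read above:
# SOLR_CONNECTIONS = {'default': {'URL': 'http://localhost:8983/solr',
#                                 'COLLECTION': 'mycollection'}}
solr, collection = get_solr_connection()
res = solr.query(collection, {'q': '*:*', 'rows': 0})
print(res.get_num_found())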
Example #38
 def test_index(self):
     index = IndexQ(test_config['indexqbase'], 'testq')
     solr = SolrClient(test_config['SOLR_SERVER'], devel=True, auth=test_config['SOLR_CREDENTIALS'])
     solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
     buff = []
     files = []
     for doc in self.docs:
         files.append(index.add(doc, finalize=True))
     index.index(solr,test_config['SOLR_COLLECTION'])
     solr.commit(test_config['SOLR_COLLECTION'],openSearcher=True)
     for doc in self.docs:
         res = solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])})
         self.assertTrue(res.get_results_count()==1)
     solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
Example #39
 def test_complete_compress_basic_re_indexing(self):
     log = logging.getLogger()
     solr = SolrClient(test_config['SOLR_SERVER'],
                       devel=True,
                       auth=test_config['SOLR_CREDENTIALS'])
     index = IndexQ(test_config['indexqbase'], 'testq', size = 1, log = log,
                    compress=True)
     solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
     for item in self.docs[1:10]:
         index.add(item, finalize=True)
     index.index(solr, test_config['SOLR_COLLECTION'])
     # At this point items are indexed and are moved into the done directory
     # Lets re-index them to make sure all json got properly encoded
     files = index.get_all_as_list('_done_dir')
     for f in index.get_all_as_list('_done_dir'):
         shutil.move(f, index._todo_dir)
     index.index(solr, test_config['SOLR_COLLECTION'])
     self.assertEqual(files, index.get_all_as_list('_done_dir'))
Example #40
 def test_solr_to_solr_resume_basic(self):
     """
     Checks the date_range_query generation function. Since it's pretty simple, running all the tests as one
     """
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
     reindexer = Reindexer(
         source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
     )
     # Make sure only source has data
     self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
     reindexer.resume()
     sleep(10)
     # Make sure counts match up after reindex
     self.assertEqual(
         len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
         len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
     )
Example #41
def computeScores1(type, query, output_file):
    solr = SolrClient('http://localhost:8983/solr')

    res = solr.query(query['index'], {
        'q': '*:*',
        'wt': 'json',
        'indent': True,
        'rows': 1000,
    })

    docs = res.data['response']['docs']

    with open(output_file, "wb") as outF:
        a = csv.writer(outF, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        a.writerow(["type", "x-coordinate", "y-coordinate", "Similarity_score"])

        for doc in docs:
            for key in doc:
                if key in ["id", "_version_"]:
                    continue
                try:
                    doc[key] = doc[key][0].encode("ascii", "ignore")
                except:
                    doc[key] = str(doc[key][0]).decode("unicode_escape").encode("ascii", "ignore")

        doc_tuples = itertools.combinations(docs, 2)
        for raw1, raw2 in doc_tuples:

            doc1 = raw1.copy()
            doc2 = raw2.copy()

            if "Name" in doc1:
                row_cosine_distance = [type, doc1["Name"], doc2["Name"]]
            else:
                row_cosine_distance = [type, doc1["name"], doc2["name"]]

            v1 = Vector(row_cosine_distance[0], doc1)
            v2 = Vector(row_cosine_distance[1], doc2)

            row_cosine_distance.append(v1.cosTheta(v2))

            a.writerow(row_cosine_distance)
Example #42
    def get(self):

        term = self.get_argument('term')

        client = SolrClient('http://localhost:8983/solr')
        res = client.query('stocks', {
            #'q' : 'symbol:%s' % '*'
            'q' : term
        })

        res = json.loads(res.get_json())
        docs = res['response']['docs']

        formatted = []

        for doc in docs:
            formatted.append({
                'name' : doc['name'],
                'symbol' : doc['symbol'],
                'sector' : doc['sector'],
                'open' : doc['open']
            })

        self.write(json.dumps(formatted))
Example #43
 def setUpClass(self):
     self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                            devel=True,
                            auth=test_config['SOLR_CREDENTIALS'])
     self.rand_docs = RandomTestData()
     self.docs = self.rand_docs.get_docs(50)
     self.coll = test_config['SOLR_COLLECTION']+str(random.random()*100)
     self.temp_dir = test_config['temp_data']
     res, con_info = self.solr.collections.api('create', {
                             'name': self.coll,
                             'numShards': 1,
                             'replicationFactor': 1,
                             'collection.configName': 'basic_configs'
                             })
     sleep(2)
     self.zk = self.solr.get_zk()
Example #44
class ZKTest(unittest.TestCase):
    #High level zk tests

    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        self.coll = test_config['SOLR_COLLECTION']+str(random.random()*100)
        self.temp_dir = test_config['temp_data']
        res, con_info = self.solr.collections.api('create', {
                                'name': self.coll,
                                'numShards': 1,
                                'replicationFactor': 1,
                                'collection.configName': 'basic_configs'
                                })
        sleep(2)
        self.zk = self.solr.get_zk()

    @classmethod
    def tearDownClass(self):
        res, con_info = self.solr.collections.api('delete', {'name':self.coll})

    def test_zk_get_collection_config_bad_collection(self):
        with self.assertRaises(ZookeeperError):
            self.zk.download_collection_configs('asdasdasd', self.temp_dir + os.sep + self.coll)

    def test_zk_copy_config(self):
        a = self.zk.copy_config('basic_configs', 'new_config')
        self.assertTrue(self.zk.kz.get('/configs/new_config'))
        self.zk.kz.delete('/configs/new_config', recursive=True)

    def test_download_collection_configs(self):
        # really bad test, need to rework later
        a = self.zk.download_collection_configs('basic_configs',
                                                self.temp_dir+'/configs')
        self.assertTrue(os.path.isdir(self.temp_dir+'/configs'))

    def test_upload_collection_configs(self):
        a = self.zk.upload_collection_configs('test1', self.temp_dir+'/configs/basic_configs')
        self.zk.kz.delete('/configs/test1', recursive=True)
Example #45
 def setUpClass(self):
     self.solr = SolrClient(test_config['SOLR_SERVER'][0],devel=True,auth=test_config['SOLR_CREDENTIALS'])
     self.rand_docs = RandomTestData()
     self.docs = self.rand_docs.get_docs(50)
     self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
     
     for field in test_config['collections']['copy_fields']:
         try:
             self.solr.collections.delete_copy_field(test_config['SOLR_COLLECTION'],field)
         except:
             pass
     for field in test_config['collections']['fields']:
         try:
             self.solr.collections.create_field(test_config['SOLR_COLLECTION'],field)
         except:
             pass
     
     #Index Some data
     self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
     self.solr.commit(test_config['SOLR_COLLECTION'],openSearcher=True)
Example #46
Note: We do use album, album_art, uri, title, artist
We don't use duration or position, and we don't use metadata, since it doesn't fully match the metadata needed to play songs, which we capture through Wireshark.

The upload format for SolrClient (a Python 3 Solr client library) is a JSON-serialized list of dictionaries:
[{'id':'After_The_Gold_Rush_Birds', 'artist':'Neil Young', 'title':'Birds', 'album':'After the Gold Rush',
'uri':'x-sonos-http:amz%3atr%3a44ce93d2-4105-416a-a905-51fe0f38ed9a.mp4?sid=26&flags=8224&sn=2'...}{...
'''

from SolrClient import SolrClient
import sys
import json
import requests
from config import ec_uri

solr = SolrClient(ec_uri+':8983/solr')
collection = 'sonos_companion'

file_name = input("What file do you want to use for uploading track information to solr?")

with open(file_name,'r') as f:
    z = f.read()

full_items = json.loads(z)
documents = []
for item in full_items:
    document = {}
    # We create a unique id by concatenating the album and the song title
    id_ = item['album'] + ' ' + item['title']
    id_ = id_.replace(' ', '_')
    document['id'] = id_
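The excerpt stops while still building each document; a hedged sketch of how the loop might finish and push the batch to Solr (which fields get copied is an assumption based on the note above):

    # Hypothetical continuation: copy the remaining fields mentioned in the note.
    for field in ('artist', 'title', 'album', 'album_art', 'uri'):
        if field in item:
            document[field] = item[field]
    documents.append(document)

# Index the batch and commit so the documents become searchable.
solr.index_json(collection, json.dumps(documents))
solr.commit(collection)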
Example #47
#!/usr/bin/env python

from __future__ import division
import json
import os
from SolrClient import SolrClient
import sys
from tika import detector

solr = SolrClient('http://localhost:8983/solr')
walk_n = sum(len(files) for root, dirs, files in os.walk(sys.argv[1]))
walk_i = 0
ratios = {}
for root, dirs, files in os.walk(sys.argv[1]):
    for file in files:
        path = root + '/' + file
        file_size = os.stat(path).st_size
        if file_size == 0: continue
        mime = detector.from_file(path)
        total, n = ratios.get(mime, (0, 0))
        ratios[mime] = total + len(json.dumps(solr.query('collection1', {'q': 'id:' + file}).data['response']['docs'])) / file_size, n + 1
        walk_i += 1
        print(str(walk_i * 100 // walk_n) + '%\r', end='')
with open('size-diversity.json', 'w') as f:
    json.dump({mime: total / n for mime, (total, n) in ratios.items()}, f)
Example #48
 def test_access_without_auth(self):
     if not test_config['SOLR_CREDENTIALS'][0]:
         return
     solr = SolrClient(test_config['SOLR_SERVER'],devel=True)
     with self.assertRaises(ConnectionError) as cm:
         solr.query('SolrClient_unittest',{'q':'not_gonna_happen'})
Example #49
 def __init__(self):
     self._logger = logging.getLogger(__name__)
     self.remote_solr_client = SolrClient(remote_solr_server, username="******", password="******")
     self.local_solr_client = SolrClient(local_solr_server)
Example #50
class ClientTestIndexing(unittest.TestCase):
    #High Level Client Tests
    
    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        
        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.schema.delete_copy_field(test_config['SOLR_COLLECTION'],field)
            except:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.schema.create_field(test_config['SOLR_COLLECTION'],field)
            except:
                pass
                
    def setUp(self):
        self.delete_docs()
        self.commit()
    
    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
        self.commit()
        
    def commit(self):
        self.solr.commit(test_config['SOLR_COLLECTION'],openSearcher=True)
        sleep(5)
    
    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'],devel=True)
        with self.assertRaises(ConnectionError) as cm:
            solr.query('SolrClient_unittest',{'q':'not_gonna_happen'})
            
    
    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1)
        self.delete_docs()
        self.commit()
    
    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()
    
    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json','w') as f:
            json.dump(self.docs,f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass
            
    
    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass
            
    @unittest.skip("Don't test remote indexing in travis")
    def test_local_index_json_file(self):
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json','w') as f:
            json.dump(self.docs,f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'],'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}, rows=50):
            self.assertTrue(len(res.docs) == 50)
            docs.extend(res.docs)
            queries +=1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key= lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])]
            )
        self.assertTrue(1000/50 == queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass   

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}):
            self.assertTrue(len(res.docs) == 1000)
            docs.extend(res.docs)
            queries +=1
        self.assertTrue(queries == 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key= lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])]
            )
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass              
            
    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'], {'q':'*:*'}, rows = 50, max_start = 502):
            self.assertTrue(len(res.docs) == 50)
            queries +=1
            docs.extend(res.docs)
        ids = [x['id'] for x in docs]

        for item in docs:
            self.assertTrue(item['id'] in ids)

        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass    
Example #51
 def setUpClass(self):
     logging.debug("Starting to run Reindexer Tests")
     self.solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
     self.colls = [test_config["SOLR_REINDEXER_COLLECTION_S"], test_config["SOLR_REINDEXER_COLLECTION_D"]]
     self.rand_docs = RandomTestData()
Example #52
class Integrator(object):
    """
    Provide integration with KNow Knowledge Portal as scheduled batch job
    """

    def __init__(self):
        self._logger = logging.getLogger(__name__)
        self.remote_solr_client = SolrClient(remote_solr_server, username="******", password="******")
        self.local_solr_client = SolrClient(local_solr_server)

    def batch_processing_product_issue_attachments(self):
        """
        query remote Solr server to retrieve all the attachment ids
        :return:
        """
        self._logger.info("starting to retrieving attachement urls and batch indexing textual attachments ...")
        # solrClient=SolrClient(remote_solr_server)

        batch_num = 10

        response = self.remote_solr_client.load_documents_by_custom_query('attachment_ids_txt:*', start=0,
                                                                          rows=batch_num)
        total_num = response['numFound']
        self._logger.info("total number of document with attachments: [%s]", total_num)

        # if total_num > batch_num :
        for start_index in range(0, total_num, batch_num):
            response = self.remote_solr_client.load_documents_by_custom_query('attachment_ids_txt:*',
                                                                              start=start_index,
                                                                              rows=batch_num)
            docs = response['docs']
            try:
                self.batch_indexing_documents(docs)
                self._logger.info("batch indexing documents. progress [%s]", start_index)
            except IntegrationException as error:
                self._logger.error("error batch processing while indexing!")
                raise

        self._logger.info("complete batch processing of documents. Documents has been indexed completely.")

    def batch_indexing_documents(self, docs):
        """
        Batch process the attachments associated with each product issue.

        :param docs: list of dict, Solr document objects
        :return:
        """
        self._logger.info("batch processing and indexing [%s] product issues ..." % len(docs))

        docs_to_index = []

        for doc in docs:
            prod_issue_doc_id = doc['id']
            attachment_ids = doc['attachment_ids_txt'] if 'attachment_ids_txt' in doc else ''

            # domain specific metadata
            prod_issue = doc[
                'product_issue_details#productIssue_s'] if 'product_issue_details#productIssue_s' in doc else ''
            product = doc['product_issue_details#product_s'] if 'product_issue_details#product_s' in doc else ''
            prod_issue_location = doc[
                'product_issue_details#location_s'] if 'product_issue_details#location_s' in doc else ''
            prod_issue_owner = doc['product_issue_details#owner_s'] if 'product_issue_details#owner_s' in doc else ''

            location_type = doc['location#type_s'] if 'location#type_s' in doc else ''
            location_local_name = doc['location#localName_s'] if 'location#localName_s' in doc else ''

            metadata_dict = {"literal.product_issue_details#productIssue_s": prod_issue,
                             "literal.product_issue_details#product_s": product,
                             "literal.product_issue_details#location_s": prod_issue_location,
                             "literal.location#type_s": location_type,
                             "literal.product_issue_details#owner_s": prod_issue_owner,
                             "literal.location#localName_s": location_local_name,
                             "literal.prod_issue_doc_id_s": prod_issue_doc_id}

            for attachment_id in attachment_ids:
                attachment_url = self.request_attachment_url_by_id(attachment_id)
                if not is_url_accessible(attachment_url):
                    self._logger.warn("The attachment [%s] is not accessible.", attachment_url)
                    continue

                if is_image(attachment_url):
                    self._logger.warn("The attachment [%s] is image. Skip for indexing", attachment_url)
                    continue

                existing_doc = self.local_solr_client.load_document_by_id(attachment_url)
                try:
                    if existing_doc is None:
                        self._logger.debug("current doc is not exist. Indexing now...")
                        self.local_solr_client.update_document_by_url(attachment_url, metadata=metadata_dict)
                        self._logger.debug("new doc is indexed.")
                    else:
                        # if the doc already exists,
                        #   update it with any new metadata
                        self._logger.debug("doc already exists. Updating the existing index entry now...")
                        existing_doc.update(metadata_dict)
                        self.local_solr_client.update_document_by_url(attachment_url, metadata=existing_doc)
                        self._logger.debug("updating of existing doc is complete.")
                except SolrError as solrError:
                    error_message = str(solrError)
                    self._logger.error(error_message)

                    if 'Conflict' in str(solrError):
                        error_message = "Conflict! Another process is running."
                    raise IntegrationException(error_message)

                    # config Solr for improved indexing speed
                    # self.solr_client.commit_all()

    @staticmethod
    def request_attachment_url_by_id(attachment_id):
        """
        Request an attachment URL by attachment id.
        :param attachment_id:
        :return: string, attachment url
        """
        _headers = {"Authorization": attachment_retrieval_api_auth_token}
        attachment_retrieval_get_api = attachment_retrieval_api

        r = requests.get(attachment_retrieval_get_api + "/" + str(attachment_id), headers=_headers)
        if r.status_code == 200:
            response = json.loads(r.text)  # json.loads no longer takes an 'encoding' argument in Python 3
            attachment_url = response["url"]
        else:
            raise Exception(r.reason)

        return attachment_url
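The Integrator above calls is_url_accessible and is_image without defining them; a minimal sketch of those helpers, assuming a HEAD request for reachability and a plain extension check for images (neither taken from the original code), might look like this:

import requests

IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')  # assumed set of image types

def is_url_accessible(url, timeout=10):
    # treat the attachment as reachable if a HEAD request returns a non-error status
    try:
        return requests.head(url, allow_redirects=True, timeout=timeout).status_code < 400
    except requests.RequestException:
        return False

def is_image(url):
    # crude check based on the file extension at the end of the URL
    return url.lower().rstrip('/').endswith(IMAGE_EXTENSIONS)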
Example #53
0
class ReindexerTests(unittest.TestCase):

    # Methods to create the schema in the collections
    def create_fields(self):
        for coll in self.colls:
            logging.debug("Creating fields for {}".format(coll))
            for field in test_config["collections"]["fields"]:
                try:
                    self.solr.schema.create_field(coll, field)
                except ValueError:
                    # Field probably already exists
                    pass

    def create_copy_fields(self):
        for coll in self.colls:
            logging.debug("Creating copy fields for {}".format(coll))
            for field in test_config["collections"]["copy_fields"]:
                try:
                    self.solr.schema.create_copy_field(coll, field)
                except ValueError:
                    # Field probably already exists
                    pass

    def setUp(self):
        [self.solr.delete_doc_by_id(coll, "*") for coll in self.colls]
        [self.solr.commit(coll, openSearcher=True) for coll in self.colls]

    def _index_docs(self, numDocs, coll):
        """
        Generates and indexes in random data while maintaining counts of items in various date ranges.

        These counts in self.date_counts are used later to validate some reindexing methods. 

        Brace yourself or have a drink.....
        """
        self.docs = self.rand_docs.get_docs(numDocs)
        sdate = datetime.datetime.now() - datetime.timedelta(days=180)
        edate = datetime.datetime.now() + datetime.timedelta(days=30)
        self._start_date = sdate
        self._end_date = edate

        import random

        # Assign random times to documents that are generated. This is used to spread out the documents over multiple time ranges
        hours = (edate - sdate).days * 24
        hour_range = [x for x in range(int(hours))]
        self.date_counts = {}

        # Save the newest and oldest timestamps as well as assign them to first and second doc
        self.docs[0]["date"] = sdate.isoformat() + "Z"
        self.date_counts[sdate.date().isoformat()] = 1

        self.docs[1]["date"] = edate.isoformat() + "Z"
        self.date_counts[edate.date().isoformat()] = 1

        for doc in self.docs[2:]:
            # Make a new date and store a count of it so I can compare later
            new_date = sdate + datetime.timedelta(hours=random.choice(hour_range))
            new_date_s = new_date.date().isoformat()
            if new_date_s in self.date_counts:
                self.date_counts[new_date_s] += 1
            else:
                self.date_counts[new_date_s] = 1
            doc["date"] = new_date.isoformat() + "Z"

        self.solr.index_json(coll, json.dumps(self.docs))
        self.solr.commit(coll, openSearcher=True)
        time.sleep(10)

    def get_all_json_from_indexq(self, index):
        files = index.get_all_as_list()
        out = []
        for efile in files:
            if efile.endswith(".gz"):
                f = gzip.open(efile, "rt", encoding="utf-8")
            else:
                f = open(efile)
            f_data = json.load(f)
            f.close()
            out.extend(f_data)
        return out

    @classmethod
    def setUpClass(self):
        logging.debug("Starting to run Reindexer Tests")
        self.solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        self.colls = [test_config["SOLR_REINDEXER_COLLECTION_S"], test_config["SOLR_REINDEXER_COLLECTION_D"]]
        self.rand_docs = RandomTestData()

    def test_solr_to_indexq(self):
        """
        Will export documents from Solr and put them into an IndexQ. 
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        for dir in ["_todo_dir", "_done_dir"]:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        self._index_docs(5000, self.colls[0])
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        from_solr = self.solr.query("source_coll", {"q": "*:*", "rows": 5000}).docs
        from_solr = reindexer._trim_fields(from_solr)
        self.assertEqual(sorted(from_files, key=lambda x: x["id"]), sorted(from_solr, key=lambda x: x["id"]))

    def test_ignore_fields(self):
        """
        Checks that the default ignore fields are set on the Reindexer.
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        for dir in ["_todo_dir", "_done_dir"]:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
        for field in ["_version_", "product_name_exact"]:
            self.assertTrue(field in reindexer._ignore_fields)

    def test_ignore_fields_disable(self):
        """
        Checks to make sure ignore_fields override works
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index, ignore_fields=False)
        self.assertEqual(reindexer._ignore_fields, False)

    def test_ignore_fields_override(self):
        """
        Checks to make sure ignore_fields override works
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        reindexer = Reindexer(
            source=self.solr, source_coll="source_coll", dest=index, ignore_fields=["_text_", "_any_other_field"]
        )
        self.assertEqual(reindexer._ignore_fields, ["_text_", "_any_other_field"])

    def test_get_copy_fields(self):
        """
        Tests the method to get copy fields from Solr. 
        """
        reindexer = Reindexer(source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter")
        self.assertEqual(
            reindexer._get_copy_fields(),
            [field["dest"] for field in self.solr.schema.get_schema_copyfields(self.colls[0])],
        )

    def test_query_gen(self):
        """
        Tests the cursor query generation method.
        """
        reindexer = Reindexer(source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter")
        self.assertEqual(
            reindexer._get_query("cursor"),
            {"cursorMark": "cursor", "rows": reindexer._rows, "q": "*:*", "sort": "id desc"},
        )

    def test_query_gen_pershard(self):
        """
        Tests cursor query generation when per_shard is enabled.
        """
        reindexer = Reindexer(
            source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter", per_shard=True
        )
        self.assertEqual(
            reindexer._get_query("cursor"),
            {"cursorMark": "cursor", "rows": reindexer._rows, "q": "*:*", "sort": "id desc", "distrib": "false"},
        )

    def test_query_gen_date(self):
        """
        Tests cursor query generation when a date field is supplied.
        """
        reindexer = Reindexer(
            source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter", date_field="ddddd"
        )
        self.assertEqual(
            reindexer._get_query("cursor"),
            {
                "cursorMark": "cursor",
                "rows": reindexer._rows,
                "q": "*:*",
                "sort": "id desc",
                "sort": "ddddd asc, id desc",
            },
        )

    def test_remove_copy_fields_from_data(self):
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        for dir in ["_todo_dir", "_done_dir"]:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        excluded_fields = reindexer._ignore_fields
        for doc in from_files:
            for field in excluded_fields:
                if field in doc:
                    print(doc)
                # self.assertTrue(field not in doc)

    def test_solr_to_solr(self):
        self._index_docs(50000, self.colls[0])
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=self.solr, dest_coll="dest_coll")
        reindexer.reindex()
        # compare ids rather than whole docs, since Solr adds fields such as _version_ on indexing
        self.assertEqual(
            sorted(x["id"] for x in self.solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            sorted(x["id"] for x in self.solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )

    def test_solr_to_solr_with_date(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
        )
        reindexer.reindex()
        try:
            self.assertTrue(solr.transport._action_log[1]["params"]["params"]["sort"] == "index_date asc, id desc")
        except KeyError:
            self.assertTrue(solr.transport._action_log[2]["params"]["params"]["sort"] == "index_date asc, id desc")
        # compare ids rather than whole docs, since Solr adds fields such as _version_ on indexing
        self.assertEqual(
            sorted(x["id"] for x in solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            sorted(x["id"] for x in solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )

    def test_get_edge_date(self):
        """
        Checks to make sure _get_edge_date returns correct start and end dates. 
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
        )
        solr_end_date_string = reindexer._get_edge_date("date", "desc")
        solr_start_date_string = reindexer._get_edge_date("date", "asc")
        self.assertEqual(
            self._start_date.date(), datetime.datetime.strptime(solr_start_date_string, "%Y-%m-%dT%H:%M:%S.%fZ").date()
        )
        self.assertEqual(
            self._end_date.date(), datetime.datetime.strptime(solr_end_date_string, "%Y-%m-%dT%H:%M:%S.%fZ").date()
        )

    def test_get_date_range_query(self):
        """
        Checks the date_range_query generation function. Since it's pretty simple, running all the tests as one
        """
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "index_date",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1DAY",
            },
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11", date_field="date123"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "date123",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1DAY",
            },
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11", date_field="date123", timespan="MONTH"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "date123",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1MONTH",
            },
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11", timespan="MONTH"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "index_date",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1MONTH",
            },
        )

    def test_get_date_facet_counts(self):
        """
        Checks the date facet counts. Makes sure the date ranges returned match what got indexed.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts(
            "DAY", "date", start_date=self._start_date.date().isoformat()
        )
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(dt_range, "%Y-%m-%dT%H:%M:%SZ").date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range], self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_without_start_date(self):
        """
        Checks the date facet counts when no start date is supplied.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts("DAY", "date")
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(dt_range, "%Y-%m-%dT%H:%M:%SZ").date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range], self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_not_day(self):
        """
        Checks that requesting facet counts with a timespan other than DAY raises a ValueError.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Testing this one
        with self.assertRaises(ValueError):
            source_facet, dest_facet = reindexer._get_date_facet_counts("MONTH", "date")

    ## These tests are focused on methods related to resuming re-indexing

    def test_solr_to_solr_resume_checkonly(self):
        """
        Checks that resume(check=True) only reports the outstanding documents and does not index anything.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        reindexer.resume(check=True)
        # Makes sure nothing got indexed
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)

    def test_solr_to_solr_resume_basic(self):
        """
        Checks that a plain resume() brings the destination collection in line with the source.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only the source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )

    def test_solr_to_solr_reindex_and_resume(self):
        """
        Only reindexes half of the collection on the first time. Then goes back and does a resume to make sure it works. 
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only the source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        # This gets somewhat of a midpoint date in the range.
        midpoint = datetime.datetime.now() - datetime.timedelta(days=((self._end_date - self._start_date).days / 2))
        # Reindex approximately half of the data by restricting FQ
        reindexer.reindex(fq=["date:[* TO {}]".format(midpoint.isoformat() + "Z")])
        sleep(10)
        # Make sure we have at least 20% of the data.
        dest_count = len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs)
        s_count = len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs)
        self.assertTrue(s_count > dest_count > s_count * 0.20)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )

    def test_solr_to_solr_reindex_and_resume_reverse(self):
        """
        Only reindexes half of the collection on the first time. Then goes back and does a resume to make sure it works. 
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        # This gets somewhat of a midpoint date in the range.
        midpoint = datetime.datetime.now() - datetime.timedelta(days=((self._end_date - self._start_date).days / 2))
        # Reindex approximately half of the data by restricting FQ
        reindexer.reindex(fq=["date:[{} TO *]".format(midpoint.isoformat() + "Z")])
        sleep(10)
        # Make sure we have at least 20% of the data.
        dest_count = len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs)
        s_count = len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs)
        self.assertTrue(s_count > dest_count > s_count * 0.20)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )

    def test_solr_to_solr_reindexer_per_shard(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", per_shard=True, date_field="date"
        )
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        reindexer.reindex()
        # sloppy check over here, will improve later
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )
Example #54
0
'''
Create a playlist manually by entering songs one at a time
and searching Solr for the particular song.
There is also create_playlist_from_queue.py, which has you put the songs on the queue
(from a playlist or whatever) and creates a playlist from the queue.
'''
from SolrClient import SolrClient
from config import ec_uri

solr = SolrClient(ec_uri+':8983/solr')
collection = 'sonos_companion'

track_title = input("\nwhat is the title of the track that you are looking for? ")
s = 'title:' + ' AND title:'.join(track_title.split())
result = solr.query(collection, {'q':s, 'rows':10, 'fl':['score', 'id', 'uri', 'title', 'artist', 'album'], 'sort':'score desc'}) 
tracks = result.docs
count = result.get_results_count()
if count==0:
    print("Didn't find any tracks\n")
elif count==1:
    track = tracks[0]
    try:
        print('id: ' + track['id'])
        print('artist: ' + track['artist'])
        print('album: ' + track['album'])
        print('song: ' + track['title'])
        print('uri: ' + track['uri'])
    except Exception as e:
        print(e)
    print('------------------------------------------------------------------------------------------------')
else:    
Example #55
0
    documents = []
    n=1
    for track in queue:
        title = track.title
        uri = track.uri
        id_ = album + ' ' + title
        id_ = id_.replace(' ', '_')
        id_ = id_.lower()
        document = {"id":id_, "title":title, "uri":uri, "album":album, "artist":artist, "track":n}
        print(repr(document).encode('cp1252', errors='replace')) 
        for k in document:
            print(str(k+':'+str(document[k])).encode('cp1252', errors='ignore'))
        documents.append(document)
        n+=1

    solr = SolrClient(solr_uri+'/solr')
    collection = 'sonos_companion'

    response = solr.index_json(collection, json.dumps(documents))
    print(response)

    # Since solr.commit didn't seem to work, substituted the below, which works
    url = solr_uri+"/solr/"+collection+"/update"
    r = requests.post(url, data={"commit":"true"})
    print(r.text)
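    # For reference, other examples in this file commit through the client itself
    # (solr.commit(collection, openSearcher=True)); the raw POST above is kept here
    # because the author found the client call unreliable in this setup.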

    resp = input("Do you want to continue? (y or n) ")
    if resp not in ('y', 'yes'):
        cont = False

######################################################################
Example #56
0
import os
import time
from time import sleep
import random
import json
import argparse
import sys
import datetime
home = os.path.split(os.getcwd())[0]
sys.path = [os.path.join(home, 'SoCo')] + sys.path
import soco
from soco import config
import boto3 
import config as c
from SolrClient import SolrClient

solr = SolrClient(c.ec_uri+':8983/solr')
collection = 'sonos_companion'

parser = argparse.ArgumentParser(description='Command line options ...')
parser.add_argument('--player', '-p', default='all', help="This is the name of the player you want to control or all")
args = parser.parse_args()

s3 = boto3.resource('s3')
object = s3.Object('sonos-scrobble','location')
location = object.get()['Body'].read()
print("The current location is {}".format(location))

sqs = boto3.resource('sqs', region_name='us-east-1') 
queue_name = 'echo_sonos_ct' if location==b'ct' else 'echo_sonos'
sqs_queue = sqs.get_queue_by_name(QueueName=queue_name) 
Example #57
0
from solr import *
import pysolr

#conn = solr.solr("http://solr.example.net/solr")
#conn = solr.Solr("http://solr.example.net/solr")
#solr.SearchHandler(conn,"/select")
#conn.query()
import sklearn
from SolrClient import SolrClient

solr=SolrClient('http://192.168.1.100:8983/solr/')

result=solr.query('tableAbstract',{'q':'memBody:blood','facet':True,'facet.range.start':0,'facet.range.end':1000000})
for x in result.docs:
    #print(x['id'])
    print(int(float(x['id'])))
    #print(x['id'])
print(result.get_num_found())
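As written, the last query enables faceting and sets facet.range.start / facet.range.end, but it never names the field to range-facet on (facet.range) or a bucket width (facet.range.gap), so Solr will simply ignore those range parameters. A minimal sketch of a complete range-facet request, assuming a hypothetical numeric field price_i that is not part of the original index, might look like this:

result = solr.query('tableAbstract', {
    'q': 'memBody:blood',
    'facet': True,
    'facet.range': 'price_i',        # hypothetical numeric field to bucket on
    'facet.range.start': 0,
    'facet.range.end': 1000000,
    'facet.range.gap': 100000,       # bucket width; required once facet.range is set
})
print(result.get_num_found())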