def index_docs(docs, solr_url, corpus, buffer_size=2000):
    from solr import Solr
    solr = Solr(solr_url)
    # Flatten each document into its segment records
    docs = (seg for doc in docs for seg in doc.to_rec_dicts())

    def set_corpus(doc):
        doc['corpus'] = corpus
        return doc

    # Tag every record with the corpus name before posting
    docs = map(set_corpus, docs)
    solr.post_iterator(docs, buffer_size=buffer_size)

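The `Solr` helper these snippets import is not shown anywhere in this section. As a rough sketch only, assuming `post_iterator` drains a document iterator in fixed-size buffers against Solr's JSON update handler and returns a `(count, success)` pair (the convention the later snippets unpack), it might look like this:

# Hypothetical sketch only -- the real client comes from 'from solr import Solr'
# and its internals may differ.
import json
import urllib.request


class Solr(object):

    def __init__(self, url):
        self.url = url.rstrip('/')

    def _post(self, docs, commit=False):
        # POST one buffer of docs to Solr's JSON update handler.
        target = "%s/update?commit=%s" % (self.url, str(commit).lower())
        req = urllib.request.Request(
            target,
            data=json.dumps(docs).encode('utf-8'),
            headers={'Content-Type': 'application/json'})
        with urllib.request.urlopen(req) as resp:
            return resp.status == 200

    def commit(self):
        self._post([], commit=True)

    def post_iterator(self, docs, commit=False, buffer_size=1000):
        # Drain the iterator in buffer_size chunks so arbitrarily large
        # streams never sit in memory; return (count, success).
        count, success, buf = 0, True, []
        for doc in docs:
            buf.append(doc)
            if len(buf) >= buffer_size:
                success = self._post(buf) and success
                count += len(buf)
                buf = []
        if buf:
            success = self._post(buf) and success
            count += len(buf)
        if commit:
            self.commit()
        return count, success
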
def index(self, docs, solr_url):
    solr = Solr(solr_url)
    # post_iterator returns (count, success), as in the other snippets;
    # unpacking in that order makes the doc count print correctly.
    count, success = solr.post_iterator(docs)
    if success:
        print("Indexed %d docs" % count)
    else:
        print("Error: Indexing failed, check solr logs")

def solrIngest(URL, dataset=None, inputDir=None, accept=None):
    solr = Solr(URL)
    documents = []
    if dataset:
        documents = lazyDataset(dataset)
    elif inputDir:
        documents = lazySolr(inputDir, accept)
    count, res = solr.post_iterator(documents, commit=True, buffer_size=100)
    print("Res : %s; count=%d" % (res, count))

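`lazyDataset` and `lazySolr` are project helpers that are not shown here; the buffered `post_iterator` call only makes sense if they yield documents lazily rather than building a list. A hypothetical `lazyDataset` over a JSON-lines dump, purely to illustrate the assumed contract:

import json


def lazyDataset(path):
    # Hypothetical stand-in: stream one doc dict per JSON line so the
    # caller's buffer_size=100 batching is the only thing held in memory.
    with open(path) as inf:
        for line in inf:
            line = line.strip()
            if line:
                yield json.loads(line)
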
def index(self, solr_url, in_file):
    '''
    Reads annotations at the specified path and indexes them to solr
    @param solr_url Target Solr URL to index
    @param in_file CSV file having text file and annotation file paths
    '''
    solr = Solr(solr_url)
    recs = self.read_records(in_file)
    count, success = solr.post_iterator(recs)
    if success:
        print("Indexed %d docs" % count)
    else:
        print("Error: Failed. Check solr logs")

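`self.read_records` is defined elsewhere in this class; per the docstring it walks a CSV whose rows pair a text file with its annotation file. A hypothetical sketch of that shape (the column layout and field names are assumed, not taken from the source):

import csv


def read_records(self, in_file):
    # Hypothetical: yield one Solr doc per (text file, annotation file) row.
    with open(in_file) as inf:
        for text_path, ann_path in csv.reader(inf):
            with open(text_path) as tf:
                yield {'id': text_path,
                       'content': tf.read(),
                       'annotation_file': ann_path}
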
    objects = []
    scores = []
    for obj, confd in row.items():
        # An entry may name several comma-separated objects sharing one score
        for o in obj.split(","):
            objects.append(o.strip())
            scores.append(confd)
    delta['objects'] = {'set': objects}
    delta['confidence'] = {'set': scores}
    yield delta


if __name__ == '__main__':
    # Get the CSV file from classifier-local.py
    if len(sys.argv) != 2:
        print("required args:\n <CSV_file>")
        sys.exit(1)
    infile = sys.argv[1]
    min_confidence = 0.30
    print("Reading from %s, Min confidence=%f" % (infile, min_confidence))
    solr_url = "http://localhost:8983/solr/imagecatdev"
    solr = Solr(solr_url)
    updates = generate_solr_updates(infile, min_confidence=min_confidence)
    count, res = solr.post_iterator(updates, commit=True, buffer_size=1000)
    print("Res : %s; count=%d" % (res, count))
    '''
    from pprint import pprint
    for u in updates:
        pprint(u)
    '''

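The opening of `generate_solr_updates` is cut off above; the call site shows it takes the CSV path and a `min_confidence` cutoff, and the surviving fragment implies each `row` maps object labels to confidence scores. A hypothetical reconstruction of the whole generator, with the CSV parsing invented for illustration:

import csv


def generate_solr_updates(infile, min_confidence=0.30):
    # Hypothetical reconstruction; only the body from 'objects = []' down
    # appears in the original, the row parsing here is assumed.
    with open(infile) as inf:
        for rec in csv.DictReader(inf):
            doc_id = rec.pop('id')  # assumed id column
            # Keep only labels at or above the confidence cutoff
            row = {label: float(conf) for label, conf in rec.items()
                   if conf and float(conf) >= min_confidence}
            delta = {'id': doc_id}
            objects = []
            scores = []
            for obj, confd in row.items():
                for o in obj.split(","):
                    objects.append(o.strip())
                    scores.append(confd)
            delta['objects'] = {'set': objects}
            delta['confidence'] = {'set': scores}
            yield delta
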
o.write("\n") count += 1 return count def read_stream(filename): ''' Reads json line stream :param filename: path to json line :return: doc stream ''' with open(filename) as inf: for l in inf: yield json.loads(l) if __name__ == '__main__': url = "http://imagecat.dyndns.org:8983/solr/imagecatdev" filename = "docs.docs.jsonl" solr = Solr(url) docs = solr.query_iterator("lastModified:[1960-01-01T00:00:00Z TO 2005-12-31T00:00:00Z]", rows=1000, fl='id') count = store_stream(docs, filename) print("Wrote %d docs to %s" % (count, filename)) docs = read_stream(filename) updates = remove_last_modified(docs) count, success = solr.post_iterator(updates, False) print(success) print(count)
            u['phonenumbers'] = {'set': d['ner_phone_number_ts_md']}
            u['ner_phone_number_ts_md'] = {'set': None}
        else:
            print("Error: Skipped")
            continue
        yield u


def read_stream(filename):
    '''
    Reads json line stream
    :param filename: path to json line
    :return: doc stream
    '''
    with open(filename) as inf:
        for l in inf:
            yield json.loads(l)


if __name__ == '__main__':
    url = "http://127.0.0.1:8983/solr/imagecatdev"
    solr = Solr(url)
    docs = solr.query_iterator(
        "ner_phone_number_t_md:* OR ner_phone_number_ts_md:*",
        rows=1000, fl='id,ner_phone_number_t_md,ner_phone_number_ts_md',
        sort="indexedAt asc")
    updates = fix_phonenumbers(docs)
    count, success = solr.post_iterator(updates, False, buffer_size=1000)
    solr.commit()
    print(success)
    print(count)

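Only the tail of `fix_phonenumbers` survives above. Judging from the query, which matches either `ner_phone_number_t_md` or `ner_phone_number_ts_md`, the function presumably folds both legacy fields into `phonenumbers`; a hypothetical reconstruction consistent with the visible branch:

def fix_phonenumbers(docs):
    # Hypothetical reconstruction: the branch for ner_phone_number_t_md
    # is assumed, only the ts_md branch appears in the original.
    for d in docs:
        u = {'id': d['id']}
        if 'ner_phone_number_t_md' in d:
            u['phonenumbers'] = {'set': d['ner_phone_number_t_md']}
            u['ner_phone_number_t_md'] = {'set': None}
        elif 'ner_phone_number_ts_md' in d:
            u['phonenumbers'] = {'set': d['ner_phone_number_ts_md']}
            u['ner_phone_number_ts_md'] = {'set': None}
        else:
            print("Error: Skipped")
            continue
        yield u
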
print("Error: Skipped") continue yield u def read_stream(filename): ''' Reads json line stream :param filename: path to json line :return: doc stream ''' with open(filename) as inf: for l in inf: yield json.loads(l) if __name__ == '__main__': url = "http://127.0.0.1:8983/solr/imagecatdev" solr = Solr(url) docs = solr.query_iterator( "ner_phone_number_t_md:* OR ner_phone_number_ts_md:*", rows=1000, fl='id,ner_phone_number_t_md,ner_phone_number_ts_md', sort="indexedAt asc") updates = fix_phonenumbers(docs) count, success = solr.post_iterator(updates, False, buffer_size=1000) solr.commit() print(success) print(count)