Ejemplo n.º 1
0
def index_docs(docs, solr_url, corpus, buffer_size=2000):
    """Flatten documents into record dicts, tag each with the corpus name,
    and post them to Solr in buffered batches.

    :param docs: iterable of documents exposing ``to_rec_dicts()``
    :param solr_url: target Solr core URL
    :param corpus: corpus name stamped onto every record's 'corpus' field
    :param buffer_size: number of records posted per batch
    """
    from solr import Solr
    client = Solr(solr_url)

    def _tagged_records():
        # A single doc may expand into several record dicts; tag each one.
        for doc in docs:
            for rec in doc.to_rec_dicts():
                rec['corpus'] = corpus
                yield rec

    client.post_iterator(_tagged_records(), buffer_size=buffer_size)
Ejemplo n.º 2
0
 def index(self, docs, solr_url):
     """Post an iterable of documents to Solr and report the outcome."""
     client = Solr(solr_url)
     ok, n_docs = client.post_iterator(docs)
     if not ok:
         print("Error: Indexing failed, check solr logs")
     else:
         print("Indexed %d docs" % n_docs)
Ejemplo n.º 3
0
 def index(self, docs, solr_url):
     """Index the given documents into the Solr core at *solr_url*."""
     outcome, total = Solr(solr_url).post_iterator(docs)
     msg = ("Indexed %d docs" % total) if outcome \
         else "Error: Indexing failed, check solr logs"
     print(msg)
Ejemplo n.º 4
0
def solrIngest(URL, dataset=None, inputDir=None, accept=None):
    """Ingest documents into Solr, sourced lazily from either a dataset
    or an input directory (dataset wins when both are given).

    :param URL: Solr core URL
    :param dataset: optional dataset identifier for lazyDataset()
    :param inputDir: optional directory scanned by lazySolr()
    :param accept: filter passed through to lazySolr()
    """
    solr = Solr(URL)

    if dataset:
        docs = lazyDataset(dataset)
    elif inputDir:
        docs = lazySolr(inputDir, accept)
    else:
        # Neither source given: post nothing.
        docs = []

    count, res = solr.post_iterator(docs, commit=True, buffer_size=100)
    print("Res : %s; count=%d" % (res, count))
Ejemplo n.º 5
0
 def index(self, solr_url, in_file):
     """Index annotation records read from *in_file* into Solr.

     @param solr_url Target Solr URL to index
     @param in_file CSV file having text file and annotation file paths
     """
     client = Solr(solr_url)
     n_indexed, ok = client.post_iterator(self.read_records(in_file))
     if ok:
         print("Indexed %d docs" % n_indexed)
     else:
         print("Error: Failed. Check solr logs")
Ejemplo n.º 6
0
def solrIngest(URL, dataset=None, inputDir=None, accept=None):
    """Push documents to the Solr core at *URL*.

    Documents come lazily from the dataset when one is given, otherwise
    from *inputDir* filtered by *accept*; with neither, nothing is posted.
    """
    documents = []
    if dataset:
        documents = lazyDataset(dataset)
    elif inputDir:
        documents = lazySolr(inputDir, accept)

    total, response = Solr(URL).post_iterator(
        documents, commit=True, buffer_size=100)
    print("Res : %s; count=%d" % (response, total))
 def index(self, solr_url, in_file):
     """Read annotation records listed in *in_file* and push them to Solr.

     @param solr_url Target Solr URL to index
     @param in_file CSV file having text file and annotation file paths
     """
     recs = self.read_records(in_file)
     count, success = Solr(solr_url).post_iterator(recs)
     message = ("Indexed %d docs" % count) if success \
         else "Error: Failed. Check solr logs"
     print(message)
Ejemplo n.º 8
0
                objects = []
                scores = []
                for obj, confd in row.items():
                    for o in obj.split(","):
                        objects.append(o.strip())
                        scores.append(confd)
            delta['objects'] = {'set' : objects}
            delta['confidence'] = {'set': scores}
            yield delta


if __name__ == '__main__':
    # Expect the classifier-produced CSV path as the single CLI argument.
    if len(sys.argv) != 2:
        print("required args:\n <CSV_file>")
        sys.exit(1)

    infile = sys.argv[1]
    min_confidence = 0.30
    print("Reading from %s, Min confidence=%f" % (infile, min_confidence))

    solr_url = "http://localhost:8983/solr/imagecatdev"
    updates = generate_solr_updates(infile, min_confidence=min_confidence)
    count, res = Solr(solr_url).post_iterator(
        updates, commit=True, buffer_size=1000)
    print("Res : %s; count=%d" % (res, count))
    # Debug aid (note: the iterator is consumed by post_iterator above):
    # from pprint import pprint
    # for u in updates:
    #     pprint(u)
Ejemplo n.º 9
0
            o.write("\n")
            count += 1
        return count

def read_stream(filename):
    '''
    Reads a json-lines file lazily, yielding one parsed document per line.

    Blank lines (e.g. stray trailing newlines, a common artifact in
    hand-edited .jsonl files) are skipped instead of raising
    json.JSONDecodeError, which the previous version did.

    :param filename: path to json line file
    :return: doc stream (generator of parsed JSON values)
    '''
    with open(filename) as inf:
        for line in inf:
            line = line.strip()
            if line:  # skip empty/whitespace-only lines
                yield json.loads(line)

if __name__ == '__main__':
    url = "http://imagecat.dyndns.org:8983/solr/imagecatdev"
    filename = "docs.docs.jsonl"
    solr = Solr(url)

    # Pull ids of docs whose lastModified falls inside the target window.
    query = "lastModified:[1960-01-01T00:00:00Z TO 2005-12-31T00:00:00Z]"
    docs = solr.query_iterator(query, rows=1000, fl='id')

    n_stored = store_stream(docs, filename)
    print("Wrote %d docs to %s" % (n_stored, filename))

    # Re-read the dump and post field-removal updates back to Solr.
    updates = remove_last_modified(read_stream(filename))
    count, success = solr.post_iterator(updates, False)
    print(success)
    print(count)
Ejemplo n.º 10
0

def read_stream(filename):
    """Lazily yield one parsed JSON document per line of *filename*.

    :param filename: path to a json-lines file
    :return: generator of parsed docs
    """
    with open(filename) as fp:
        for raw in fp:
            yield json.loads(raw)


if __name__ == '__main__':
    url = "http://imagecat.dyndns.org:8983/solr/imagecatdev"
    filename = "docs.docs.jsonl"

    solr = Solr(url)
    # Query ids of docs with lastModified inside the bogus date window.
    matched = solr.query_iterator(
        "lastModified:[1960-01-01T00:00:00Z TO 2005-12-31T00:00:00Z]",
        rows=1000,
        fl='id')

    written = store_stream(matched, filename)
    print("Wrote %d docs to %s" % (written, filename))

    # Stream the dump back and post the field-removal updates.
    updates = remove_last_modified(read_stream(filename))
    count, success = solr.post_iterator(updates, False)
    print(success)
    print(count)
Ejemplo n.º 11
0
            u['phonenumbers'] = {'set': d['ner_phone_number_ts_md']}
            u['ner_phone_number_ts_md'] = {'set': None}
        else:
            print("Error: Skipped")
            continue
        yield u


def read_stream(filename):
    '''
    Stream parsed documents out of a json-lines file, one per line.

    :param filename: path to json line
    :return: doc stream
    '''
    with open(filename) as source:
        yield from map(json.loads, source)

if __name__ == '__main__':
    solr = Solr("http://127.0.0.1:8983/solr/imagecatdev")

    # Fetch every doc carrying either phone-number field variant, in
    # stable (indexedAt) order so the pass is resumable.
    docs = solr.query_iterator(
        "ner_phone_number_t_md:* OR ner_phone_number_ts_md:*",
        rows=1000,
        fl='id,ner_phone_number_t_md,ner_phone_number_ts_md',
        sort="indexedAt asc")

    count, success = solr.post_iterator(
        fix_phonenumbers(docs), False, buffer_size=1000)
    solr.commit()
    print(success)
    print(count)

Ejemplo n.º 12
0
            print("Error: Skipped")
            continue
        yield u


def read_stream(filename):
    """Generator over a json-lines file: parses and yields each line.

    :param filename: path to json line
    :return: doc stream
    """
    handle = open(filename)
    try:
        for entry in handle:
            yield json.loads(entry)
    finally:
        # Close even if the consumer abandons the generator early.
        handle.close()


if __name__ == '__main__':
    url = "http://127.0.0.1:8983/solr/imagecatdev"
    solr = Solr(url)

    # Select all docs having either phone-number field populated.
    phone_query = "ner_phone_number_t_md:* OR ner_phone_number_ts_md:*"
    docs = solr.query_iterator(
        phone_query,
        rows=1000,
        fl='id,ner_phone_number_t_md,ner_phone_number_ts_md',
        sort="indexedAt asc")

    updates = fix_phonenumbers(docs)
    count, success = solr.post_iterator(updates, False, buffer_size=1000)
    solr.commit()
    for value in (success, count):
        print(value)