def main(options):
    
    # get the available defined fields from the Luke request handler
    data = req('%s/admin/luke?wt=json&show=schema&indent=true' % options.query_endpoint, show='schema')
    fields = data['schema']['fields'].keys()
    
    if (os.path.exists('term-freqs.txt') and EXISTING_TERMS_REPLACE) or not os.path.exists('term-freqs.txt'):
        
        # for each field, retrieve its high/med/low frequency terms
        with csv_writer('term-freqs.txt', ['field', 'type', 'token', 'freq']) as writer:
            
            
            for f in fields:
                try:
                    print 'Getting freqs for: %s' % f
                    
                    rsp = req('%s/terms' % options.query_endpoint, **{'terms.fl':f, 'terms.limit':RETRIEVE_MAX_TOKENS})
                    
                    high_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
                    write_terms(writer, f, 'high', high_freq)
                    
                    if not high_freq:
                        continue
                    
                    # target counts below half of the rarest "high" term
                    max_count = int(max(0.1, min(high_freq.values())) / 2) - 1
                    
                    if max_count < 1:
                        continue
                    
                    rsp = req('%s/terms' % options.query_endpoint, **{'terms.fl':f, 'terms.limit':RETRIEVE_MAX_TOKENS, 'terms.maxcount': max_count})
                    
                    med_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
                    write_terms(writer, f, 'med', med_freq)
                    
                    if not med_freq:
                        continue
                    
                    # clamped to at least 1, so no further check is needed before querying
                    max_count = max(int(max(0.1, min(med_freq.values())) / 2) - 1, 1)
                    
                    rsp = req('%s/terms' % options.query_endpoint, **{'terms.fl':f, 'terms.limit':RETRIEVE_MAX_TOKENS, 'terms.maxcount': max_count})
                    
                    low_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
                    write_terms(writer, f, 'low', low_freq)
                    
                except Exception:
                    print 'Error getting terms for: %s' % f
                    traceback.print_exc()
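
# `write_terms` is used above but never defined in these snippets. A minimal
# sketch of what it presumably does, matching the ['field', 'type', 'token',
# 'freq'] columns declared for the csv writer (parameter names here are
# hypothetical):
def write_terms(writer, field, freq_type, freqs):
    # one row per token, in the declared column order
    for token, freq in freqs.items():
        writer.writerow([field, freq_type, token, freq])
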
def main(options):
    
    # get the available defined fields from the Luke request handler
    data = req('%s/admin/luke?wt=json&show=schema&indent=true' % options.query_endpoint, show='schema')
    fields = data['schema']['fields'].keys()
    
    if (os.path.exists('term-freqs.txt') and EXISTING_TERMS_REPLACE) or not os.path.exists('term-freqs.txt'):
        retrieve_term_freqs(options, fields)
    
    
    if (os.path.exists('phrase-freqs.txt.2') and EXISTING_TERMS_REPLACE) \
        or not os.path.exists('phrase-freqs.txt.2'):
        retrieve_pseudo_collocations(options, maxlen=[2, 5], stop_after_reaching=10000,
                                     output_name='phrase-freqs.txt') 
    

    generate_field_queries(options)
    generate_wild_queries(options)
    
    generate_phrase_queries(options, length=2, input='phrase-freqs.txt.2')
    generate_phrase_queries(options, length=5, input='phrase-freqs.txt.5')
    
    generate_fuzzy_queries(options, length=1, input='phrase-freqs.txt.2')
    generate_fuzzy_queries(options, length=2, input='phrase-freqs.txt.2')
    
    generate_near_queries(options, length=2, input='phrase-freqs.txt.5')
    generate_near_queries(options, length=4, input='phrase-freqs.txt.5')
    
    generate_boolean_queries(options, 'AND', length=5, input='phrase-freqs.txt.2')
    generate_boolean_queries(options, 'AND', length=2, input='phrase-freqs.txt.2')
    
    generate_boolean_queries(options, 'OR', length=5, input='phrase-freqs.txt.2')
    generate_boolean_queries(options, 'OR', length=2, input='phrase-freqs.txt.2')
def retrieve_term_freqs(options, fields):
    # for each field, retrieve its high/med/low frequency terms
    fo, writer = csv_writer('term-freqs.txt', ['field', 'type', 'token', 'freq'])
    for f in fields:
        try:
            print 'Getting freqs for: %s' % f
            
            rsp = req('%s/terms' % options.query_endpoint, **{'terms.fl':f, 'terms.limit':RETRIEVE_MAX_TOKENS})
            
            high_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
            write_terms(writer, f, 'high', high_freq)
            
            if not high_freq:
                continue
            
            # target counts below half of the rarest "high" term
            max_count = int(max(0.1, min(high_freq.values())) / 2) - 1
            
            if max_count < 1:
                continue
            
            rsp = req('%s/terms' % options.query_endpoint, **{'terms.fl':f, 'terms.limit':RETRIEVE_MAX_TOKENS, 'terms.maxcount': max_count})
            
            med_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
            write_terms(writer, f, 'med', med_freq)
            
            if not med_freq:
                continue
            
            # clamped to at least 1, so no further check is needed before querying
            max_count = max(int(max(0.1, min(med_freq.values())) / 2) - 1, 1)
            
            rsp = req('%s/terms' % options.query_endpoint, **{'terms.fl':f, 'terms.limit':RETRIEVE_MAX_TOKENS, 'terms.maxcount': max_count})
            
            low_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
            write_terms(writer, f, 'low', low_freq)
            
        except Exception:
            print 'Error getting terms for: %s' % f
            traceback.print_exc()
    fo.close()
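
# Solr's TermsComponent returns each field's terms as a flat list alternating
# token and document frequency: [t1, n1, t2, n2, ...]. The
# dict(zip(lst[0::2], lst[1::2])) idiom used above pairs the even-indexed
# tokens with the odd-indexed counts:
#
#   >>> flat = ['apple', 42, 'banana', 17, 'cherry', 3]
#   >>> dict(zip(flat[0::2], flat[1::2]))
#   {'apple': 42, 'banana': 17, 'cherry': 3}   # key order may vary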
def main(options):
    
    # get the available defined fields from the Luke request handler
    data = req('%s/admin/luke?wt=json&show=schema&indent=true' % options.query_endpoint, show='schema')
    fields = data['schema']['fields'].keys()
    
    if (os.path.exists('term-freqs.txt') and EXISTING_TERMS_REPLACE) or not os.path.exists('term-freqs.txt'):
        retrieve_term_freqs(options, fields)
    
    
    if (os.path.exists('phrase-freqs.txt') and EXISTING_TERMS_REPLACE) \
        or not os.path.exists('phrase-freqs.txt'):
        retrieve_pseudo_collocations(options, maxlen=2, stop_after_reaching=100000,
                                     output_name='phrase-freqs.txt') 
    
    if (os.path.exists('phrase5-freqs.txt') and EXISTING_TERMS_REPLACE) \
        or not os.path.exists('phrase5-freqs.txt'):
        retrieve_pseudo_collocations(options, maxlen=5, stop_after_reaching=100000,
                                     output_name='phrase5-freqs.txt')


    generate_field_queries(options)
    generate_wild_queries(options)
    
    generate_phrase_queries(options, length=2, input='phrase-freqs.txt')
    generate_phrase_queries(options, length=5, input='phrase5-freqs.txt')
    
    generate_fuzzy_queries(options, length=1, input='phrase-freqs.txt')
    generate_fuzzy_queries(options, length=2, input='phrase-freqs.txt')
    
    generate_near_queries(options, length=2, input='phrase5-freqs.txt')
    generate_near_queries(options, length=4, input='phrase5-freqs.txt')
    
    generate_boolean_queries(options, 'AND', length=5, input='phrase5-freqs.txt')
    generate_boolean_queries(options, 'AND', length=2, input='phrase5-freqs.txt')
    
    generate_boolean_queries(options, 'OR', length=5, input='phrase5-freqs.txt')
    generate_boolean_queries(options, 'OR', length=2, input='phrase5-freqs.txt')
def retrieve_pseudo_collocations(options, max_time=600, 
                                 maxlen=3, stop_after_reaching=100000,
                                 max_clauses=2,
                                 upper_limit='1.0',
                                 lower_limit='0.97',
                                 output_name='collocations-freqs.txt'):
    
    fo, writer = csv_writer(output_name, ['field', 'type', 'token', 'freq'])
    
    terms = {}
    for fn in DISCOVER_PHRASES_FIELDS:
        terms[fn] = []
        
    for term in csv_reader('term-freqs.txt', generic=True):
        if len(term) != 4:
            continue
        if term[1] == 'high' and term[0] in DISCOVER_PHRASES_FIELDS:
            terms[term[0]].append(term[2])
              
    jobs = {}  
    for fn in DISCOVER_PHRASES_FIELDS:
        if fn not in terms:
            print 'skipping: %s as we have no data for it' % fn
            continue
        # register job
        rsp = req("%s/batch" % options.query_endpoint,
            command="find-freq-phrases",
            maxlen=maxlen,
            upperLimit=upper_limit,
            lowerLimit=lower_limit,
            fields=fn,
            maxClauses=max_clauses,
            stopAfterReaching=stop_after_reaching,
            )
        jobs[fn] = rsp['jobid']
        
        # first write the seed terms to disk (reuse the mkstemp fd instead of leaking it)
        fd_num, tmpfile = tempfile.mkstemp()
        with os.fdopen(fd_num, 'w') as fd:
            fd.write("\n".join(terms[fn]))
        
        kwdata = dict(endpoint=options.query_endpoint, jobid=rsp['jobid'], tmpfile=tmpfile)
        run_cmd(["curl '%(endpoint)s/batch?command=receive-data&jobid=%(jobid)s' --data-binary @%(tmpfile)s -H 'Content-type:text/txt; charset=utf-8'"
                 % kwdata])
        os.remove(tmpfile)  # the uploaded temp file is no longer needed
        
    
    # start processing
    rsp = req("%s/batch" % options.query_endpoint,
              command="start")
    
    deadline = time.time() + max_time
    jobs_finished = {}
    while time.time() < deadline:
        if len(jobs) == 0:
            break
        # iterate over a snapshot, since finished jobs are deleted inside the loop
        for k, v in list(jobs.items()):
            rsp = req("%s/batch" % options.query_endpoint,
              command="status",
              jobid=v)
            if rsp['job-status'] == 'failed':
                error("Failed executing: %s - %s" % (k,v))
            elif rsp['job-status'] == 'finished':
                print 'finished: %s' % k
                del jobs[k]
                jobs_finished[k] = v
            else:
                time.sleep(3)
    
    
    for k,v in jobs_finished.items():
        run_cmd(["curl -o %s '%s/batch?command=get-results&jobid=%s'"
                  % ('collocations.%s.freq' % k, options.query_endpoint, v) 
                 ])
        with open('collocations.%s.freq' % k, 'r') as c_file:
            for line in c_file:
                data = line.strip().split('\t')
                if len(data) > 1:
                    writer.writerow([k, 'high', data[0], data[1]])
        os.remove('collocations.%s.freq' % k)
    
    fo.close()
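
# `run_cmd` is another undefined helper; since the callers pass a single
# pre-quoted shell string inside a list, it is presumably a thin wrapper
# around the shell. A minimal sketch (an assumption, not necessarily the
# project's actual implementation):
import subprocess

def run_cmd(args):
    # join the (usually single-element) list and run it through the shell
    cmd = ' '.join(args)
    retcode = subprocess.call(cmd, shell=True)
    if retcode != 0:
        raise Exception('command failed (exit %s): %s' % (retcode, cmd))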
def retrieve_pseudo_collocations(options,  
                                 maxlen=[3], 
                                 stop_after_reaching=100000,
                                 max_clauses=2,
                                 output_name='collocations-freqs.txt'):
    
    
    # work on a copy so the caller's list (and the mutable default) stay intact
    maxlen = maxlen[:]
    terms = {}
    for fn in DISCOVER_PHRASES_FIELDS:
        terms[fn] = []
    
    fields = set()
    for term in csv_reader('term-freqs.txt', generic=True):
        if len(term) != 4:
            continue
        if term[1] == 'high' and term[0] in DISCOVER_PHRASES_FIELDS:
            terms[term[0]].append(term[2])
            fields.add(term[0])
    
    tally = {}
    freqs = {}
    for x in maxlen:
        tally[x] = 0
        freqs[x] = {}
        for f in fields:
            freqs[x][f] = defaultdict(int)  # assumes: from collections import defaultdict
            
    for field, field_terms in terms.items():
        if not maxlen:
            break
        for term in field_terms:
            rsp = req('%s/query' % options.query_endpoint, **{
                    'q' : '{}:"{}"'.format(field, term),
                    'wt': 'json',
                    'hl': 'true',
                    'hl.fl': field,
                    'hl.requireFieldMatch': 'true',
                    'hl.simple.pre': '<em>',
                    'hl.simple.post': '</em>',
                    })
            if rsp['response'].get('numFound', 0) > 0:
                hls = extract_highlights(term, rsp['highlighting'])
                for f in list(maxlen):  # iterate a copy: f may be removed from maxlen below
                    for left in hls.get_all_left(f):
                        freqs[f][field][left] += 1
                        tally[f] += 1
                    for right in hls.get_all_right(f):
                        freqs[f][field][right] += 1
                        tally[f] += 1
                        
                    if tally[f] >= stop_after_reaching:
                        maxlen.remove(f)
        
        
    
    for length, field_freqs in freqs.items():
        fo, writer = csv_writer('{}.{}'.format(output_name, length), ['field', 'type', 'token', 'freq'])
        for field, vals in field_freqs.items():
            vs = sorted(vals.items(), key=lambda x: x[1], reverse=True)
            for v, freq in vs:
                try:
                    writer.writerow([field, 'high', v, freq])
                except UnicodeEncodeError:
                    # retry with a utf-8 encoded byte string; drop the row if that also fails
                    try:
                        writer.writerow([field, 'high', v.encode('utf8'), freq])
                    except UnicodeEncodeError:
                        pass
        fo.close()
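
# None of the snippets above define the shared helpers they call. Below is a
# minimal sketch of plausible implementations, assuming `req` issues a GET
# request and parses the JSON response, and that the csv helpers wrap the
# stdlib csv module with tab-separated values. The real project may differ;
# note that the first snippet uses csv_writer as a context manager, while the
# tuple form sketched here matches the later snippets.
import csv
import json
import urllib
import urllib2

def req(url, **kwargs):
    # append query parameters and parse the JSON response
    kwargs.setdefault('wt', 'json')
    sep = '&' if '?' in url else '?'
    return json.load(urllib2.urlopen(url + sep + urllib.urlencode(kwargs)))

def csv_writer(path, header):
    # returns (file_object, writer); the caller closes the file object
    fo = open(path, 'wb')
    writer = csv.writer(fo, delimiter='\t')
    writer.writerow(header)
    return fo, writer

def csv_reader(path, generic=False):
    # yields one row (a list of column values) at a time; the `generic` flag
    # seen above is accepted but ignored in this sketch
    fi = open(path, 'rb')
    try:
        for row in csv.reader(fi, delimiter='\t'):
            yield row
    finally:
        fi.close()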