def term_group(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if not request["q"]:
        dump = json.dumps({"original_query": "", "term_groups": []})
    else:
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        sort_order=request["sort_order"],
                        **request.metadata)
        parsed = parse_query(request.q)
        group = group_terms(parsed)
        all_groups = split_terms(group)
        # Flatten each group of (kind, term) tuples into a display string.
        term_groups = []
        for g in all_groups:
            term_group = ''
            not_started = False
            for kind, term in g:
                if kind == 'NOT':
                    if not_started is False:
                        not_started = True
                        term_group += ' NOT '
                elif kind == 'OR':
                    term_group += '|'
                elif kind == "TERM" or kind == "QUOTE":
                    term_group += ' %s ' % term
            term_group = term_group.strip()
            term_groups.append(term_group)
        dump = json.dumps({"term_groups": term_groups, "original_query": request.original_q})
    yield dump.encode('utf8')
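# A minimal, self-contained sketch (not part of the original code) of the
# group-flattening loop in term_group() above, assuming split_terms() yields
# lists of (kind, term) tuples with kind in {"TERM", "QUOTE", "NOT", "OR"};
# the sample input below is hypothetical.
def flatten_group(group):
    """Render one parsed term group as a display string, mirroring term_group()."""
    out = ""
    negated = False
    for kind, term in group:
        if kind == "NOT":
            if not negated:  # emit the NOT marker only once per group
                negated = True
                out += " NOT "
        elif kind == "OR":
            out += "|"
        elif kind in ("TERM", "QUOTE"):
            out += " %s " % term
    return out.strip()

# Example: flatten_group([("TERM", "love"), ("OR", ""), ("TERM", "hate")])
# returns "love | hate".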
def term_group(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    parsed = parse_query(request.q)
    group = group_terms(parsed)
    all_groups = split_terms(group)
    term_groups = []
    for g in all_groups:
        term_group = ''
        not_started = False
        for kind, term in g:
            if kind == 'NOT':
                if not not_started:
                    not_started = True
                    term_group += ' NOT '
            elif kind == 'OR':
                term_group += '|'
            elif kind == "TERM" or kind == "QUOTE":
                term_group += ' %s ' % term
        term_group = term_group.strip()
        term_groups.append(term_group)
    yield json.dumps(term_groups)
def query_lowlevel(db, param_dict):
    vars = []
    clauses = []
    # if column = _philo_id I can do a special query here
    for column, values in param_dict.items():
        norm_path = db.path + "/frequencies/normalized_" + column + "_frequencies"
        for v in values:
            try:
                v = v.decode('utf-8')
            except (AttributeError, UnicodeDecodeError):
                pass  # already a decoded string
            parsed = parse_query(v)
            if db.locals['debug']:
                print("METADATA_TOKENS:", parsed, file=sys.stderr)
            grouped = group_terms(parsed)
            if db.locals['debug']:
                print("METADATA_SYNTAX:", grouped, file=sys.stderr)
            expanded = expand_grouped_query(grouped, norm_path)
            if db.locals['debug']:
                print("METADATA_SYNTAX:", expanded, file=sys.stderr)
            sql_clause = make_grouped_sql_clause(expanded, column)
            if db.locals['debug']:
                print("SQL_SYNTAX:", sql_clause, file=sys.stderr)
            clauses.append(sql_clause)
    if clauses:
        query = "SELECT word_count, philo_id FROM toms WHERE " + \
                " AND ".join("(%s)" % c for c in clauses) + ";"
    else:
        query = "SELECT word_count, philo_id FROM toms;"
    if db.locals['debug']:
        print("INNER QUERY: ", "%s %% %s" % (query, vars), file=sys.stderr)
    results = db.dbh.execute(query, vars)
    return results
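# Usage sketch for query_lowlevel() (hypothetical metadata values): each value is
# run through the same parse_query/group_terms pipeline as word searches, expanded
# against the per-field normalized frequency file, and turned into one SQL clause;
# the clauses are then ANDed together against the "toms" metadata table, e.g.
#
#     results = query_lowlevel(db, {"author": ["Voltaire"], "title": ["candide"]})
#
# The exact clause text depends on expand_grouped_query() and
# make_grouped_sql_clause(), which are defined elsewhere.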
def format_query(q, db, config):
    parsed = parse_query(q)
    group = group_terms(parsed)
    all_groups = split_terms(group)
    # We extract every word tuple
    word_groups = []
    for g in all_groups:
        for inner_g in g:
            word_groups.append(inner_g)
    last_group = word_groups.pop()  # we take the last tuple for autocomplete
    token = last_group[1]
    kind = last_group[0]
    if word_groups:
        prefix = ' '.join([i[1] for i in word_groups]) + " "
    else:
        prefix = ''
    frequency_file = config.db_path + "/data/frequencies/normalized_word_frequencies"
    if kind == "TERM":
        expanded_token = token + '.*'
        grep_proc = grep_word(expanded_token, frequency_file, subprocess.PIPE, db.locals['lowercase_index'])
    elif kind == "QUOTE":
        expanded_token = token[:-1] + '.*' + token[-1]
        grep_proc = grep_exact(expanded_token, frequency_file, subprocess.PIPE)
    elif kind == "NOT" or kind == "OR":
        return []
    matches = []
    len_token = len(token)
    for line in grep_proc.stdout:
        word = line.split(b'\t')[1].strip().decode('utf8')
        highlighted_word = highlighter(word, len_token)
        matches.append(highlighted_word)
    output_string = []
    for m in matches:
        if kind == "QUOTE":
            output_string.append(prefix + '"%s"' % m)
        else:
            output_string.append(prefix + m)
    return output_string
# Earlier Python 2 variant of format_query(): the token and the grep output are
# byte strings here, hence the explicit decode and the text-mode split below.
def format_query(q, db, config):
    parsed = parse_query(q)
    group = group_terms(parsed)
    all_groups = split_terms(group)
    # We extract every word tuple
    word_groups = []
    for g in all_groups:
        for inner_g in g:
            word_groups.append(inner_g)
    last_group = word_groups.pop()  # we take the last tuple for autocomplete
    token = last_group[1]
    kind = last_group[0]
    if word_groups:
        prefix = ' '.join([i[1] for i in word_groups]) + " "
    else:
        prefix = ''
    frequency_file = config.db_path + "/data/frequencies/normalized_word_frequencies"
    if kind == "TERM":
        expanded_token = token + '.*'
        grep_proc = grep_word(expanded_token, frequency_file, subprocess.PIPE, db.locals['lowercase_index'])
    elif kind == "QUOTE":
        expanded_token = token[:-1] + '.*' + token[-1]
        grep_proc = grep_exact(expanded_token, frequency_file, subprocess.PIPE)
    elif kind == "NOT" or kind == "OR":
        return []
    matches = []
    len_token = len(token.decode('utf-8'))
    for line in grep_proc.stdout:
        word = line.split('\t')[1].strip()
        highlighted_word = highlighter(word, len_token)
        matches.append(highlighted_word)
    output_string = []
    for m in matches:
        if kind == "QUOTE":
            output_string.append(prefix + '"%s"' % m)
        else:
            output_string.append(prefix + m)
    return output_string
def format_query(q, db):
    parsed = parse_query(q)
    group = group_terms(parsed)
    all_groups = split_terms(group)
    # We extract every word tuple
    word_groups = []
    for g in all_groups:
        for inner_g in g:
            word_groups.append(inner_g)
    last_group = word_groups.pop()  # we take the last tuple for autocomplete
    token = last_group[1]
    kind = last_group[0]
    if word_groups:
        prefix = ' '.join([i[1] for i in word_groups]) + " "
    else:
        prefix = ''
    if kind == "OR":
        return []
    if kind == "QUOTE":
        token = token.replace('"', '')
    frequency_file = db.locals["db_path"] + "/frequencies/normalized_word_frequencies"
    expanded_token = token + '.*'
    grep_proc = grep_word(expanded_token, frequency_file, subprocess.PIPE)
    matches = []
    len_token = len(token.decode('utf-8'))
    for line in grep_proc.stdout:
        word = line.split('\t')[1]
        highlighted_word = highlighter(word, len_token)
        matches.append(highlighted_word)
    output_string = []
    for m in matches:
        if kind == "QUOTE":
            output_string.append(prefix + '"%s"' % m)
        else:
            output_string.append(prefix + m)
    return output_string
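# Usage sketch for the format_query() variants above (hypothetical input): given a
# partially typed query such as "contrat soc", the final token ("soc") is expanded
# to the regex "soc.*" and grepped against the normalized word frequency file,
# while the earlier tokens ("contrat ") are kept as a literal prefix; each match is
# passed through highlighter() with the length of the typed token, presumably so
# the client can mark the portion the user has already typed.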
def query(db,
          terms,
          corpus_file=None,
          corpus_size=0,
          method=None,
          method_arg=None,
          limit=3000,
          filename="",
          query_debug=False,
          sort_order=None,
          raw_results=False):
    sys.stdout.flush()
    tstart = datetime.now()
    parsed = parse_query(terms)
    grouped = group_terms(parsed)
    split = split_terms(grouped)
    words_per_hit = len(split)
    origpid = os.getpid()
    dir = db.path + "/hitlists/"  # needed below for os.chdir() even when filename is supplied
    if not filename:
        hfile = str(origpid) + ".hitlist"
        filename = dir + hfile
    hl = open(filename, "w", encoding='utf8')
    err = open("/dev/null", "w")
    freq_file = db.path + "/frequencies/normalized_word_frequencies"
    if query_debug:
        print("FORKING", file=sys.stderr)
    # Double fork so the search worker is fully detached from the calling process
    # and keeps writing the hitlist after this function returns.
    pid = os.fork()
    if pid == 0:
        os.umask(0)
        os.chdir(dir)
        os.setsid()
        pid = os.fork()
        if pid > 0:
            os._exit(0)
        else:
            # now we're detached from the parent, and can do our work.
            if query_debug:
                print("WORKER DETACHED at ", datetime.now() - tstart, file=sys.stderr)
            args = ["corpus_search"]
            if corpus_file:
                args.extend(("-c", corpus_file))
            if method and method_arg:
                args.extend(("-m", method, "-a", str(method_arg)))
            args.extend(("-o", "binary", db.path))
            worker = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=hl, stderr=err, env=os.environ)
            query_log_fh = filename + ".terms"
            if query_debug:
                print("LOGGING TERMS to " + filename + ".terms", file=sys.stderr)
            logger = subprocess.Popen(["tee", query_log_fh], stdin=subprocess.PIPE, stdout=worker.stdin)
            expand_query_not(split, freq_file, logger.stdin, db.locals["lowercase_index"])
            logger.stdin.close()
            worker.stdin.close()
            returncode = worker.wait()
            if returncode == -11:
                print("SEGFAULT", file=sys.stderr)
                seg_flag = open(filename + ".error", "w")
                seg_flag.close()
            # do something to mark query as finished
            flag = open(filename + ".done", "w")
            flag.write(" ".join(args) + "\n")
            flag.close()
            os._exit(0)
    else:
        hl.close()
        return HitList.HitList(filename, words_per_hit, db, sort_order=sort_order, raw=raw_results)
def invert_grep_exact(token, in_fh, dest_fh):
    # don't strip accent or case, exact match only.
    try:
        grep_proc = subprocess.Popen(["egrep", "-a", "-v", b"[[:blank:]]%s$" % token[1:-1]],
                                     stdin=in_fh,
                                     stdout=dest_fh)
    except (UnicodeEncodeError, TypeError):
        # token came in as a str rather than bytes: encode it before interpolation.
        grep_proc = subprocess.Popen(["egrep", "-a", "-v", b"[[:blank:]]%s$" % token[1:-1].encode('utf8')],
                                     stdin=in_fh,
                                     stdout=dest_fh)
    # can't wait because input isn't ready yet.
    return grep_proc


if __name__ == "__main__":
    path = sys.argv[1]
    terms = sys.argv[2:]
    parsed = parse_query(" ".join(terms))
    print("PARSED:", parsed, file=sys.stderr)
    grouped = group_terms(parsed)
    print("GROUPED:", grouped, file=sys.stderr)
    split = split_terms(grouped)
    print("parsed %d terms:" % len(split), split, file=sys.stderr)

    class Fake_DB:
        pass

    fake_db = Fake_DB()
    from philologic.Config import Config, db_locals_defaults, db_locals_header
    fake_db.path = path + "/data/"
    fake_db.locals = Config(fake_db.path + "/db.locals.py", db_locals_defaults, db_locals_header)
    fake_db.encoding = "utf-8"
    freq_file = path + "/data/frequencies/normalized_word_frequencies"
    # expand_query_not(split, freq_file, sys.stdout)
    hits = query(fake_db, " ".join(terms), query_debug=True, raw_results=True)
def query(
    db,
    terms,
    corpus_file=None,
    corpus_size=0,
    method=None,
    method_arg=None,
    limit=3000,
    filename="",
    query_debug=False,
    sort_order=None,
    raw_results=False,
):
    sys.stdout.flush()
    tstart = datetime.now()
    parsed = parse_query(terms)
    grouped = group_terms(parsed)
    split = split_terms(grouped)
    words_per_hit = len(split)
    origpid = os.getpid()
    dir = db.path + "/hitlists/"  # needed below for os.chdir() even when filename is supplied
    if not filename:
        hfile = str(origpid) + ".hitlist"
        filename = dir + hfile
    hl = open(filename, "w", encoding="utf8")
    err = open("/dev/null", "w")
    freq_file = db.path + "/frequencies/normalized_word_frequencies"
    if query_debug:
        print("FORKING", file=sys.stderr)
    pid = os.fork()
    if pid == 0:
        os.umask(0)
        os.chdir(dir)
        os.setsid()
        pid = os.fork()
        if pid > 0:
            os._exit(0)
        else:
            # now we're detached from the parent, and can do our work.
            if query_debug:
                print("WORKER DETACHED at ", datetime.now() - tstart, file=sys.stderr)
            args = ["search5"]
            if corpus_file:
                args.extend(("--corpusfile", corpus_file))
            args.append(db.path)
            if method and method_arg:
                args.extend((method, str(method_arg)))
            worker = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=hl, stderr=err, env=os.environ)
            # worker2 = subprocess.Popen("head -c 1", stdin=subprocess.PIPE, stdout=worker.stdin, stderr=err)
            query_log_fh = filename + ".terms"
            if query_debug:
                print("LOGGING TERMS to " + filename + ".terms", file=sys.stderr)
            logger = subprocess.Popen(["tee", query_log_fh], stdin=subprocess.PIPE, stdout=worker.stdin)
            expand_query_not(split, freq_file, logger.stdin, db.locals["lowercase_index"])
            logger.stdin.close()
            worker.stdin.close()
            returncode = worker.wait()
            if returncode == -11:
                print("SEGFAULT", file=sys.stderr)
                seg_flag = open(filename + ".error", "w")
                seg_flag.close()
            # do something to mark query as finished
            flag = open(filename + ".done", "w")
            flag.write(" ".join(args) + "\n")
            flag.close()
            os._exit(0)
    else:
        hl.close()
        return HitList.HitList(filename, words_per_hit, db, method=method, sort_order=sort_order, raw=raw_results)
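# Process layout sketch for the query() variants above: the calling process
# double-forks so a detached worker can keep writing the hitlist after the HTTP
# response returns. The worker pipes the expanded terms through `tee` (logging
# them to <filename>.terms) into the external search binary ("search5" here,
# "corpus_search" in the older variant), which writes hits to <filename>; a
# <filename>.done flag (or <filename>.error on segfault) marks completion so
# HitList readers can tell when the file is final.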