Example #1
def term_group(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if not request["q"]:
        dump = json.dumps({"original_query": "", "term_groups": []})
    else:
        hits = db.query(request["q"], request["method"], request["arg"], sort_order=request["sort_order"], **request.metadata)
        parsed = parse_query(request.q)
        group = group_terms(parsed)
        all_groups = split_terms(group)
        term_groups = []
        for g in all_groups:
            term_group = ''
            not_started = False
            for kind, term in g:
                if kind == 'NOT':
                    if not_started is False:
                        not_started = True
                        term_group += ' NOT '
                elif kind == 'OR':
                    term_group += '|'
                elif kind == "TERM":
                    term_group += ' %s ' % term
                elif kind == "QUOTE":
                    term_group += ' %s ' % term
            term_group = term_group.strip()
            term_groups.append(term_group)
        dump = json.dumps({"term_groups": term_groups, "original_query": request.original_q})
    yield dump.encode('utf8')
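The loop in this example consumes whatever split_terms(group_terms(parse_query(q))) returns as groups of (kind, term) tuples. A minimal sketch of that shape and the strings the loop builds from it; the sample tuples below are assumptions for illustration, not actual parser output:

# Hypothetical (kind, term) tuples; the term value of NOT/OR tuples is ignored by the loop.
sample_groups = [
    [("TERM", "love"), ("OR", "|"), ("TERM", "death")],  # OR'ed alternatives
    [("QUOTE", '"le monde"')],                           # quoted phrase
    [("NOT", ""), ("TERM", "guerre")],                   # negated term
]
term_groups = []
for g in sample_groups:
    term_group = ""
    not_started = False
    for kind, term in g:
        if kind == "NOT":
            if not_started is False:
                not_started = True
                term_group += " NOT "
        elif kind == "OR":
            term_group += "|"
        elif kind in ("TERM", "QUOTE"):
            term_group += " %s " % term
    term_groups.append(term_group.strip())
print(term_groups)  # ['love | death', '"le monde"', 'NOT  guerre']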
Example #2
def term_group(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    parsed = parse_query(request.q)
    group = group_terms(parsed)
    all_groups = split_terms(group)
    term_groups = []
    for g in all_groups:
        term_group = ''
        not_started = False
        for kind, term in g:
            if kind == 'NOT':
                if not_started is False:
                    not_started = True
                    term_group += ' NOT '
            elif kind == 'OR':
                term_group += '|'
            elif kind == "TERM":
                term_group += ' %s ' % term
            elif kind == "QUOTE":
                term_group += ' %s ' % term
        term_group = term_group.strip()
        term_groups.append(term_group)
    yield json.dumps(term_groups)
Example #3
def query_lowlevel(db, param_dict):
    vars = []
    clauses = []
    # if column = _philo_id I can do a special query here
    for column, values in param_dict.items():
        norm_path = db.path + "/frequencies/normalized_" + column + "_frequencies"
        for v in values:
            try:
                v = v.decode('utf-8')
            except:
                pass
            parsed = parse_query(v)
            if db.locals['debug']:
                print >> sys.stderr, "METADATA_TOKENS:", parsed
            grouped = group_terms(parsed)
            if db.locals['debug']:
                print >> sys.stderr, "METADATA_SYNTAX:", grouped
            expanded = expand_grouped_query(grouped, norm_path)
            if db.locals['debug']:
                print >> sys.stderr, "METADATA_SYNTAX:", expanded
            sql_clause = make_grouped_sql_clause(expanded, column)
            if db.locals['debug']:
                print >> sys.stderr, "SQL_SYNTAX:", sql_clause
            clauses.append(sql_clause)
    if clauses:
        query = "SELECT word_count, philo_id FROM toms WHERE " + \
            " AND ".join("(%s)" % c for c in clauses) + ";"
    else:
        query = "SELECT word_count, philo_id FROM toms;"

    if db.locals['debug']:
        print >> sys.stderr, "INNER QUERY: ", "%s %% %s" % (query, vars)

    results = db.dbh.execute(query, vars)
    return results
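The final clause-joining step is easier to see with concrete inputs. A short sketch of just that step; the clause strings are placeholders, not real make_grouped_sql_clause output:

# Sketch of the query assembly only; clause contents are hypothetical.
clauses = ["author LIKE 'Rousseau%'", "title LIKE '%contrat%'"]
if clauses:
    query = ("SELECT word_count, philo_id FROM toms WHERE "
             + " AND ".join("(%s)" % c for c in clauses) + ";")
else:
    query = "SELECT word_count, philo_id FROM toms;"
print(query)
# SELECT word_count, philo_id FROM toms WHERE (author LIKE 'Rousseau%') AND (title LIKE '%contrat%');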
Example #4
def format_query(q, db, config):
    parsed = parse_query(q)
    group = group_terms(parsed)
    all_groups = split_terms(group)

    # We extract every word tuple
    word_groups = []
    for g in all_groups:
        for inner_g in g:
            word_groups.append(inner_g)
    last_group = word_groups.pop()  # we take the last tuple for autocomplete
    token = last_group[1]
    kind = last_group[0]
    if word_groups:
        prefix = ' '.join([i[1] for i in word_groups]) + " "
    else:
        prefix = ''

    frequency_file = config.db_path + "/data/frequencies/normalized_word_frequencies"

    if kind == "TERM":
        expanded_token = token + '.*'
        grep_proc = grep_word(expanded_token, frequency_file, subprocess.PIPE,
                              db.locals['lowercase_index'])
    elif kind == "QUOTE":
        expanded_token = token[:-1] + '.*' + token[-1]
        grep_proc = grep_exact(expanded_token, frequency_file, subprocess.PIPE)
    elif kind == "NOT" or kind == "OR":
        return []

    matches = []
    len_token = len(token)
    for line in grep_proc.stdout:
        word = line.split(b'\t')[1].strip().decode('utf8')
        highlighted_word = highlighter(word, len_token)
        matches.append(highlighted_word)

    output_string = []
    for m in matches:
        if kind == "QUOTE":
            output_string.append(prefix + '"%s"' % m)
        else:
            output_string.append(prefix + m)

    return output_string
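The two regex expansions above are clearer with concrete tokens. A small sketch; the sample tokens are made up, and the actual matching is done by grep_word/grep_exact against the frequency file:

# Hypothetical tokens, showing only the expansion logic used above.
token = "contr"                        # kind == "TERM"
print(token + ".*")                    # contr.*        -> prefix match via grep_word
token = '"le monde"'                   # kind == "QUOTE" (quotes are part of the token)
print(token[:-1] + ".*" + token[-1])   # "le monde.*"   -> wildcard stays inside the quotes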
Example #5
def format_query(q, db, config):
    parsed = parse_query(q)
    group = group_terms(parsed)
    all_groups = split_terms(group)

    # We extract every word tuple
    word_groups = []
    for g in all_groups:
        for inner_g in g:
            word_groups.append(inner_g)
    last_group = word_groups.pop()  # we take the last tuple for autocomplete
    token = last_group[1]
    kind = last_group[0]
    if word_groups:
        prefix = ' '.join([i[1] for i in word_groups]) + " "
    else:
        prefix = ''

    frequency_file = config.db_path + "/data/frequencies/normalized_word_frequencies"

    if kind == "TERM":
        expanded_token = token + '.*'
        grep_proc = grep_word(expanded_token, frequency_file, subprocess.PIPE,
                              db.locals['lowercase_index'])
    elif kind == "QUOTE":
        expanded_token = token[:-1] + '.*' + token[-1]
        grep_proc = grep_exact(expanded_token, frequency_file, subprocess.PIPE)
    elif kind == "NOT" or kind == "OR":
        return []

    matches = []
    len_token = len(token.decode('utf-8'))
    for line in grep_proc.stdout:
        word = line.split('\t')[1].strip()
        highlighted_word = highlighter(word, len_token)
        matches.append(highlighted_word)

    output_string = []
    for m in matches:
        if kind == "QUOTE":
            output_string.append(prefix + '"%s"' % m)
        else:
            output_string.append(prefix + m)

    return output_string
Example #6
def format_query(q, db):
    parsed = parse_query(q)
    group = group_terms(parsed)
    all_groups = split_terms(group)

    # We extract every word tuple
    word_groups = []
    for g in all_groups:
        for inner_g in g:
            word_groups.append(inner_g)
    last_group = word_groups.pop()  ## we take the last tuple for autocomplete
    token = last_group[1]
    kind = last_group[0]
    if word_groups:
        prefix = ' '.join([i[1] for i in word_groups]) + " "
    else:
        prefix = ''

    if kind == "OR":
        return []
    if kind == "QUOTE":
        token = token.replace('"', '')
    frequency_file = db.locals[
        "db_path"] + "/frequencies/normalized_word_frequencies"

    expanded_token = token + '.*'
    grep_proc = grep_word(expanded_token, frequency_file, subprocess.PIPE)

    matches = []
    len_token = len(token.decode('utf-8'))
    for line in grep_proc.stdout:
        word = line.split('\t')[1]
        highlighted_word = highlighter(word, len_token)
        matches.append(highlighted_word)

    output_string = []
    for m in matches:
        if kind == "QUOTE":
            output_string.append(prefix + '"%s"' % m)
        else:
            output_string.append(prefix + m)

    return output_string
Example #7
def term_group(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if not request["q"]:
        dump = json.dumps({"original_query": "", "term_groups": []})
    else:
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        sort_order=request["sort_order"],
                        **request.metadata)
        parsed = parse_query(request.q)
        group = group_terms(parsed)
        all_groups = split_terms(group)
        term_groups = []
        for g in all_groups:
            term_group = ''
            not_started = False
            for kind, term in g:
                if kind == 'NOT':
                    if not_started is False:
                        not_started = True
                        term_group += ' NOT '
                elif kind == 'OR':
                    term_group += '|'
                elif kind == "TERM":
                    term_group += ' %s ' % term
                elif kind == "QUOTE":
                    term_group += ' %s ' % term
            term_group = term_group.strip()
            term_groups.append(term_group)
        dump = json.dumps({
            "term_groups": term_groups,
            "original_query": request.original_q
        })
    yield dump.encode('utf8')
Example #8
def format_query(q, db):
    parsed = parse_query(q)
    group = group_terms(parsed)
    all_groups = split_terms(group)
    
    # We extract every word tuple
    word_groups = []
    for g in all_groups:
        for inner_g in g:
            word_groups.append(inner_g)
    last_group = word_groups.pop()  ## we take the last tuple for autocomplete
    token = last_group[1]
    kind = last_group[0]
    if word_groups:
        prefix = ' '.join([i[1] for i in word_groups]) + " "
    else:
        prefix = ''

    if kind == "OR":
        return []
    if kind == "QUOTE":
        token = token.replace('"', '')
    frequency_file = db.locals["db_path"]+"/frequencies/normalized_word_frequencies"
    
    expanded_token = token + '.*'
    grep_proc = grep_word(expanded_token, frequency_file, subprocess.PIPE)
    
    matches = []
    len_token = len(token.decode('utf-8'))
    for line in grep_proc.stdout:
        word = line.split('\t')[1]
        highlighted_word = highlighter(word, len_token)
        matches.append(highlighted_word)

    output_string = []
    for m in matches:
        if kind == "QUOTE":
            output_string.append(prefix + '"%s"' % m)
        else:
            output_string.append(prefix +  m)
    
    return output_string
Example #9
def query(db,
          terms,
          corpus_file=None,
          corpus_size=0,
          method=None,
          method_arg=None,
          limit=3000,
          filename="",
          query_debug=False,
          sort_order=None,
          raw_results=False):
    sys.stdout.flush()
    tstart = datetime.now()

    parsed = parse_query(terms)
    grouped = group_terms(parsed)
    split = split_terms(grouped)

    words_per_hit = len(split)
    origpid = os.getpid()
    if not filename:
        hfile = str(origpid) + ".hitlist"
    dir = db.path + "/hitlists/"
    filename = filename or (dir + hfile)
    hl = open(filename, "w", encoding='utf8')
    err = open("/dev/null", "w")
    freq_file = db.path + "/frequencies/normalized_word_frequencies"
    if query_debug:
        print("FORKING", file=sys.stderr)
    pid = os.fork()
    if pid == 0:
        os.umask(0)
        os.chdir(dir)
        os.setsid()
        pid = os.fork()
        if pid > 0:
            os._exit(0)
        else:
            # now we're detached from the parent, and can do our work.
            if query_debug:
                print("WORKER DETACHED at ", datetime.now() - tstart, file=sys.stderr)
            args = ["corpus_search"]
            if corpus_file:
                args.extend(("-c", corpus_file))
            if method and method_arg:
                args.extend(("-m", method, "-a", str(method_arg)))

            args.extend(("-o", "binary", db.path, ))

            worker = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=hl, stderr=err, env=os.environ)

            query_log_fh = filename + ".terms"
            if query_debug:
                print("LOGGING TERMS to " + filename + ".terms", file=sys.stderr)
            logger = subprocess.Popen(["tee", query_log_fh], stdin=subprocess.PIPE, stdout=worker.stdin)
            expand_query_not(split, freq_file, logger.stdin, db.locals["lowercase_index"])
            logger.stdin.close()
            worker.stdin.close()

            returncode = worker.wait()

            if returncode == -11:
                print("SEGFAULT", file=sys.stderr)
                seg_flag = open(filename + ".error", "w")
                seg_flag.close()
            # do something to mark query as finished
            flag = open(filename + ".done", "w")
            flag.write(" ".join(args) + "\n")
            flag.close()
            os._exit(0)
    else:
        hl.close()
        return HitList.HitList(filename, words_per_hit, db, sort_order=sort_order, raw=raw_results)
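The parent process returns a HitList immediately while the detached worker keeps writing the hitlist; completion is signalled by the filename + ".done" flag file (and filename + ".error" on a segfault). A hedged usage sketch; the polling helper and the explicit hitlist path are assumptions, not part of the module:

import os
import time

def wait_for_hitlist(filename, timeout=30.0, poll=0.1):
    # Hypothetical helper: block until the detached worker writes its .done flag.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if os.path.exists(filename + ".error"):  # written when corpus_search segfaults
            raise RuntimeError("search worker crashed for %s" % filename)
        if os.path.exists(filename + ".done"):
            return True
        time.sleep(poll)
    return False

# hitlist_path = db.path + "/hitlists/my_query.hitlist"   # layout taken from query() above
# hits = query(db, 'love "le monde"', filename=hitlist_path, query_debug=True)
# wait_for_hitlist(hitlist_path)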
Example #10
def invert_grep_exact(token, in_fh, dest_fh):
    #don't strip accent or case, exact match only.
    try:
        grep_proc = subprocess.Popen(["egrep", "-a", "-v", b"[[:blank:]]%s$" % token[1:-1]], stdin=in_fh, stdout=dest_fh)
    except (UnicodeEncodeError, TypeError):
        grep_proc = subprocess.Popen(["egrep", "-a", "-v", b"[[:blank:]]%s$" % token[1:-1].encode('utf8')], stdin=in_fh, stdout=dest_fh)
    #can't wait because input isn't ready yet.
    return grep_proc


if __name__ == "__main__":
    path = sys.argv[1]
    terms = sys.argv[2:]
    parsed = parse_query(" ".join(terms))
    print("PARSED:", parsed, file=sys.stderr)
    grouped = group_terms(parsed)
    print("GROUPED:", grouped, file=sys.stderr)
    split = split_terms(grouped)
    print("parsed %d terms:" % len(split), split, file=sys.stderr)

    class Fake_DB:
        pass

    fake_db = Fake_DB()
    from philologic.Config import Config, db_locals_defaults, db_locals_header
    fake_db.path = path + "/data/"
    fake_db.locals = Config(fake_db.path + "/db.locals.py", db_locals_defaults, db_locals_header)
    fake_db.encoding = "utf-8"
    freq_file = path + "/data/frequencies/normalized_word_frequencies"
    # expand_query_not(split, freq_file, sys.stdout)
    hits = query(fake_db, " ".join(terms), query_debug=True, raw_results=True)
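invert_grep_exact spawns egrep -a -v '[[:blank:]]<term>$' to drop frequency-file lines whose final field matches the quoted term exactly. A pure-Python sketch of the same filter, assuming a tab-separated file with the surface form as the last field (consistent with line.split('\t')[1] in the other examples on this page):

import re

def invert_grep_exact_py(token, lines):
    # token includes its surrounding quotes, exactly as in invert_grep_exact above
    pattern = re.compile(r"[ \t]" + re.escape(token[1:-1]) + r"$")
    return [line for line in lines if not pattern.search(line)]

lines = ["monde\tmonde", "monde\tMonde", "mondes\tmondes"]  # assumed file layout
print(invert_grep_exact_py('"monde"', lines))
# ['monde\tMonde', 'mondes\tmondes']  -- only the exact, case-sensitive match is removed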
Example #11
def query(
    db,
    terms,
    corpus_file=None,
    corpus_size=0,
    method=None,
    method_arg=None,
    limit=3000,
    filename="",
    query_debug=False,
    sort_order=None,
    raw_results=False,
):
    sys.stdout.flush()
    tstart = datetime.now()

    parsed = parse_query(terms)
    grouped = group_terms(parsed)
    split = split_terms(grouped)

    words_per_hit = len(split)
    origpid = os.getpid()
    if not filename:
        hfile = str(origpid) + ".hitlist"
    dir = db.path + "/hitlists/"
    filename = filename or (dir + hfile)
    hl = open(filename, "w", encoding="utf8")
    err = open("/dev/null", "w")
    freq_file = db.path + "/frequencies/normalized_word_frequencies"
    if query_debug:
        print("FORKING", file=sys.stderr)
    pid = os.fork()
    if pid == 0:
        os.umask(0)
        os.chdir(dir)
        os.setsid()
        pid = os.fork()
        if pid > 0:
            os._exit(0)
        else:
            # now we're detached from the parent, and can do our work.
            if query_debug:
                print("WORKER DETACHED at ",
                      datetime.now() - tstart,
                      file=sys.stderr)
            args = ["search5"]
            if corpus_file:
                args.extend(("--corpusfile", corpus_file))
            args.append(db.path)
            if method and method_arg:
                args.extend((method, str(method_arg)))

            worker = subprocess.Popen(args,
                                      stdin=subprocess.PIPE,
                                      stdout=hl,
                                      stderr=err,
                                      env=os.environ)
            # worker2 = subprocess.Popen("head -c 1", stdin=subprocess.PIPE, stdout=worker.stdin, stderr=err)

            query_log_fh = filename + ".terms"
            if query_debug:
                print("LOGGING TERMS to " + filename + ".terms",
                      file=sys.stderr)
            logger = subprocess.Popen(["tee", query_log_fh],
                                      stdin=subprocess.PIPE,
                                      stdout=worker.stdin)
            expand_query_not(split, freq_file, logger.stdin,
                             db.locals["lowercase_index"])
            logger.stdin.close()
            worker.stdin.close()

            returncode = worker.wait()

            if returncode == -11:
                print("SEGFAULT", file=sys.stderr)
                seg_flag = open(filename + ".error", "w")
                seg_flag.close()
            # do something to mark query as finished
            flag = open(filename + ".done", "w")
            flag.write(" ".join(args) + "\n")
            flag.close()
            os._exit(0)
    else:
        hl.close()
        return HitList.HitList(filename,
                               words_per_hit,
                               db,
                               method=method,
                               sort_order=sort_order,
                               raw=raw_results)
Example #12
        grep_proc = subprocess.Popen([
            "egrep", "-a", "-v",
            b"[[:blank:]]%s$" % token[1:-1].encode("utf8")
        ],
                                     stdin=in_fh,
                                     stdout=dest_fh)
    # can't wait because input isn't ready yet.
    return grep_proc


if __name__ == "__main__":
    path = sys.argv[1]
    terms = sys.argv[2:]
    parsed = parse_query(" ".join(terms))
    print("PARSED:", parsed, file=sys.stderr)
    grouped = group_terms(parsed)
    print("GROUPED:", grouped, file=sys.stderr)
    split = split_terms(grouped)
    print("parsed %d terms:" % len(split), split, file=sys.stderr)

    class Fake_DB:
        pass

    fake_db = Fake_DB()
    from philologic.Config import Config, db_locals_defaults, db_locals_header

    fake_db.path = path + "/data/"
    fake_db.locals = Config(fake_db.path + "/db.locals.py", db_locals_defaults,
                            db_locals_header)
    fake_db.encoding = "utf-8"
    freq_file = path + "/data/frequencies/normalized_word_frequencies"