Example #1
import unicodedata

def format_query(q, db):
    """Expand the final token of a query into matching completions (Python 2)."""
    parsed = parse_query(q)
    # First pass: break each quoted phrase into individually labeled words.
    parsed_split = []
    for label, token in parsed:
        if label == "QUOTE":
            if token[-1] != '"':  # close an unterminated quote
                token += '"'
            subtokens = token[1:-1].split(" ")
            parsed_split += [("QUOTE_S", sub_t) for sub_t in subtokens if sub_t]
        else:
            parsed_split.append((label, token))
    if not parsed_split:  # guard against an empty query
        return []
    output_string = []
    # Everything but the last token is kept verbatim as a prefix;
    # only the final token gets expanded.
    label, token = parsed_split[-1]
    prefix = " ".join('"' + t[1] + '"' if t[0] == "QUOTE_S" else t[1] for t in parsed_split[:-1])
    if prefix:
        prefix += " "
    if label == "QUOTE_S" or label == "TERM":
        # Accent-fold the token: NFKD decomposition, then drop combining marks.
        norm_tok = token.decode("utf-8").lower()
        norm_tok = [i for i in unicodedata.normalize("NFKD", norm_tok) if not unicodedata.combining(i)]
        norm_tok = "".join(norm_tok).encode("utf-8")
        matches = word_pattern_search(norm_tok, db.locals["db_path"] + "/frequencies/normalized_word_frequencies")
        # Also match the un-normalized form as a prefix pattern, skipping duplicates.
        substr_token = token.decode("utf-8").lower().encode("utf-8")
        exact_matches = exact_word_pattern_search(substr_token + '.*', db.locals["db_path"] + "/frequencies/word_frequencies")
        for m in exact_matches:
            if m not in matches:
                matches.append(m)
        matches = highlighter(matches, len(norm_tok))
        for m in matches:
            if label == "QUOTE_S":
                output_string.append(prefix + '"%s"' % m)
            else:
                output_string.append(prefix + m)
    return output_string
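
Both examples rely on the same accent-folding step: NFKD decomposition followed by removal of combining marks. A minimal, self-contained sketch of just that step (standard library only; the sample words are illustrative, not from the source):

import unicodedata

def accent_fold(word):
    # Decompose precomposed characters (e.g. "e-acute" -> "e" + U+0301),
    # then drop the combining marks, keeping only the base letters.
    decomposed = unicodedata.normalize("NFKD", word.lower())
    return "".join(c for c in decomposed if not unicodedata.combining(c))

print(accent_fold(u"Élégie"))   # elegie
print(accent_fold(u"naïveté"))  # naivete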
Example #2
import unicodedata

def format_query(q, db):
    """Expand every TERM in the query against the normalized frequency table (Python 2)."""
    parsed = parse_query(q)
    # First pass: break each quoted phrase into individually labeled words.
    parsed_split = []
    for label, token in parsed:
        if label == "QUOTE":
            subtokens = token[1:-1].split(" ")
            parsed_split += [("QUOTE_S", sub_t) for sub_t in subtokens if sub_t]
        else:
            parsed_split.append((label, token))

    output_string = []
    for label, token in parsed_split:
        if label == "QUOTE_S":
            # Quoted words pass through unexpanded.
            output_string += token.split()
        elif label == "TERM":
            # Accent-fold the token: NFKD decomposition, then drop combining marks.
            norm_tok = token.decode("utf-8").lower()
            norm_tok = [i for i in unicodedata.normalize("NFKD", norm_tok) if not unicodedata.combining(i)]
            norm_tok = "".join(norm_tok).encode("utf-8")
            matches = word_pattern_search(norm_tok, db.locals["db_path"] + "/frequencies/normalized_word_frequencies")
            # Deduplicate while preserving order.
            expanded = []
            for m in matches:
                if m not in expanded:
                    expanded.append(m)
            output_string += expanded
    return output_string
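
The two variants share the same first pass: quoted phrases coming out of parse_query are split into individually labeled words. A standalone sketch of that pass, with a hand-built parse result standing in for parse_query's actual output (the token format here is an assumption inferred from the code above):

def split_quotes(parsed):
    # Break each QUOTE token into per-word QUOTE_S tokens;
    # pass all other tokens through unchanged.
    parsed_split = []
    for label, token in parsed:
        if label == "QUOTE":
            subtokens = token[1:-1].split(" ")
            parsed_split += [("QUOTE_S", sub_t) for sub_t in subtokens if sub_t]
        else:
            parsed_split.append((label, token))
    return parsed_split

parsed = [("QUOTE", '"belle epoque"'), ("TERM", "paris")]
print(split_quotes(parsed))
# [('QUOTE_S', 'belle'), ('QUOTE_S', 'epoque'), ('TERM', 'paris')]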